From 88da50faec3f273d2c40f46153bae43c4abd4836 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Jan 2022 15:52:52 -0800 Subject: [PATCH 001/279] accl: Adding src code for PushEngine. --- src/accl/push_engine.hh | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/accl/push_engine.hh diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh new file mode 100644 index 0000000000..eda9d7b707 --- /dev/null +++ b/src/accl/push_engine.hh @@ -0,0 +1,69 @@ +#ifndef __ACCL_PUSH_ENGINE_HH__ +#define __ACCL_PUSH_ENGINE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/PushEngine.hh" +#include "sim/clocked_object.hh" + +class PushEngine : public ClockedObject +{ + private: + + class PushRespPort : public ResponsePort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class PushReqPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class PushMemPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + } + + PushRespPort respPort; + PushReqPort reqPort; + PushMemPort memPort; + + std::queue vertexQueue; + std::queue updateQueue; + + std::pair interpretPackPtr(PacketPtr pkt); + +}; + +#endif // __ACCL_PUSH_ENGINE_HH__ From f94fd791ed92e43ef4a977218b41646815307b84 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 
Feb 2022 17:56:49 -0800 Subject: [PATCH 002/279] Adding implementation for PushEngine (wip). --- src/accl/push_engine.cc | 120 ++++++++++++++++++++++++++++++++++++++++ src/accl/push_engine.hh | 63 ++++++++++++++++++++- src/accl/util.cc | 16 ++++++ src/accl/util.hh | 4 ++ 4 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 src/accl/push_engine.cc create mode 100644 src/accl/util.cc create mode 100644 src/accl/util.hh diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc new file mode 100644 index 0000000000..bc3138f61e --- /dev/null +++ b/src/accl/push_engine.cc @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/push_engine.hh" + +#include "debug/PushEngine.hh" + +PushEngine::PushEngine(const PushEngineParams& params): + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), + vertexQueueSize(params.vertex_queue_size), + vertexQueueLen(0), + updateQueue(params.update_queue_size), + updateQueueLen(0), + nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextCreateEvent([this]{ processNextCreateEvent(); }, name()), + nextSendEvent([this]{ processNextSendEvent(); }, name()) +{} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleUpdate(pkt); +} + +bool +PushEngine::handleUpdate(PacketPtr pkt) +{ + if (vertexQueueLen < vertexQueueSize) { + vertexQueue.push(pkt) + vertexQueueLen++; + return true; + + if (!nextReceiveEvent.scheduled()){ + schedule(nextReceiveEvent, nextCycle()); + } + } + return false; +} + +void +PushEngine::processNextReceiveEvent() +{ + PacketPtr 
updatePkt = vertexQueue.pop(); + uint8_t* data = updatePkt->getData(); + + Addr edgeListAddr = ; // TODO: Generalize finding this address. + int outDegree = ; // TODO: Generalize finding this value. + + Addr reqAddr = (edgeListAddr / 64) * 64; + Addr offsetAddr = edgeListAddr % 64; + + PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + + memPort.sendPacket(pkt); + + +} + +void +PushEngine::processNextReadEvent() +{ + +} + +void +PushEngine::processNextCreateEvent() +{ + +} + +void +PushEngine::processNextSendEvent() +{ + +} \ No newline at end of file diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index eda9d7b707..6ab902d0e2 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -1,8 +1,35 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #ifndef __ACCL_PUSH_ENGINE_HH__ #define __ACCL_PUSH_ENGINE_HH__ #include -#include #include "base/addr_range_map.hh" #include "base/statistics.hh" @@ -10,6 +37,7 @@ #include "mem/packet.hh" #include "params/PushEngine.hh" #include "sim/clocked_object.hh" +#include "sim/system.hh" class PushEngine : public ClockedObject { @@ -18,6 +46,7 @@ class PushEngine : public ClockedObject class PushRespPort : public ResponsePort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -55,14 +84,42 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } - PushRespPort respPort; + System* const system; + const RequestorID requestorId; + PushReqPort reqPort; + PushRespPort respPort; + PushMemPort memPort; std::queue vertexQueue; + int vertexQueueSize; + int vertexQueueLen; + std::queue updateQueue; + int updateQueueSize; + int updateQueueLen; + + EventFunctionWrapper nextReceiveEvent; + void processNextReceiveEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextCreateEvent; + void processNextCreateEvent(); + + EventFunctionWrapper nextSendEvent; + void processNextSendEvent(); + + bool handleUpdate(PacketPtr pkt); + + public: + + PushEngine(const PushEngineParams ¶ms); - std::pair interpretPackPtr(PacketPtr pkt); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; diff --git a/src/accl/util.cc 
b/src/accl/util.cc new file mode 100644 index 0000000000..20abd1c13a --- /dev/null +++ b/src/accl/util.cc @@ -0,0 +1,16 @@ +#include "accl/util.hh" + +PacketPtr +getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) +{ + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} diff --git a/src/accl/util.hh b/src/accl/util.hh new file mode 100644 index 0000000000..c621b9e45c --- /dev/null +++ b/src/accl/util.hh @@ -0,0 +1,4 @@ +#include "mem/packet.hh" + +PacketPtr getReadPacket(Addr addr, unsigned int size); + From f9affcfd2e022fa2876e7f780fe0cb3b91202a80 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 13:36:08 -0800 Subject: [PATCH 003/279] Adding util source code. --- src/accl/util.cc | 28 ++++++++++++++++++++++++++++ src/accl/util.hh | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/accl/util.cc b/src/accl/util.cc index 20abd1c13a..8d975c482f 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #include "accl/util.hh" PacketPtr diff --git a/src/accl/util.hh b/src/accl/util.hh index c621b9e45c..18b8e4c197 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -1,4 +1,50 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "base/types.hh" #include "mem/packet.hh" -PacketPtr getReadPacket(Addr addr, unsigned int size); +struct WorkListItem +{ + uint32_t temp_prop; + uint32_t prop; + uint32_t degree; + Addr edgeList; +} + +struct Edge +{ + uint32_t weight; + Addr neighbor; +} + +WorkListItem& memoryToWorkList(uint8_t* data); +Edge& memoryToEdge(uint8_t* data); +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); From 0ec84b3c619ef3fe2ffafed50d9638b1011e7be5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 31 Jan 2022 11:34:07 -0800 Subject: [PATCH 004/279] Adding the first version of Apply engine --- src/accl/apply.cc | 129 ++++++++++++++++++++++++++++++++++++++++++++ src/accl/apply.hh | 132 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 src/accl/apply.cc create mode 100644 src/accl/apply.hh diff --git a/src/accl/apply.cc b/src/accl/apply.cc new file mode 100644 index 0000000000..d0e2b712a6 --- /dev/null +++ b/src/accl/apply.cc @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/apply.h" + +#include + + +typedef std::pair ReqPair; +typedef std::pair QueuePair; + +Apply::Apply(const ApplyParams ¶ms): + ClockedObject(params), + nextApplyEvent([this]{processNextApplyEvent; }, name()), + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), + queueSize(params.applyQueueSize) //add this to .py +{ + applyReadQueue(queueSize); + pplyWriteQueue(queueSize); +} + +bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleWL(pkt)){ + return false; + } + return true; +} + +bool Apply::handleWL(PacketPtr pkt){ + auto queue = applyReadQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); + } + return true; +} + + +void Apply::processNextApplyCheckEvent(){ + auto queue = applyReadQueue; + memPort = ApplyMemPort + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + // handel responsehere + if (!ret) + break; + } + +} + +virtual bool +Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); 
+} + +bool +Apply::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + //check pkt (temp_prop != prop) + if (temp_prop != prop){ + //update prop with temp_prop + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } + return true; + } + return true; +} + + + +void +Apply::processNextApplyEvent(){ + auto queue = applyWriteQueue; + memPort = ApplyMemPort; + pushPort = ApplyReqPort; + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel responsehere + if (!ret || !push) + break; + + } + +} \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh new file mode 100644 index 0000000000..2ae593a1cb --- /dev/null +++ b/src/accl/apply.hh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class Apply : public ClockedObject +{ + private: + + class ApplyRespPort : public ResponsePort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ApplyRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class ApplyReqPort : public RequestPort + { + private: + APPLY *owner; + bool _blocked; + PacketPtr blockedPacket; + + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class ApplyMemPort : public RequestPort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool 
sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + + } + bool handleWL(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readApplyBuffer(); + bool handleMemResp(PacktPtr resp); + void writePushBuffer(); + + + //Events + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + ApplyQueue applyQueue; + ApplyMemPort memPort; + public(const ApplyParams &apply); +}; + +#endif // __ACCL_APPLY_HH__ \ No newline at end of file From 8bc3823fb859c37cb10352cea0a613d0fe3f740b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 5 Feb 2022 20:34:12 -0800 Subject: [PATCH 005/279] Portotyping memory interface --- src/accl/apply.cc | 36 ++++++++++++++++++++++-------------- src/accl/apply.hh | 8 +++++--- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d0e2b712a6..b0ef5e8513 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -46,7 +46,7 @@ Apply::Apply(const ApplyParams ¶ms): bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!owner->handleWL(pkt)){ + if (!this->handleWL(pkt)){ return false; } return true; @@ -73,7 +73,9 @@ void Apply::processNextApplyCheckEvent(){ while(!queue.empty()){ auto pkt = queue.pop() /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + bool ret = memPort->sendPacket(memPkt); // handel responsehere if (!ret) break; @@ -84,27 +86,24 @@ void Apply::processNextApplyCheckEvent(){ virtual bool Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) { - 
return owner->handleMemResp(pkt); + return this->handleMemResp(pkt); } bool Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - //check pkt (temp_prop != prop) - if (temp_prop != prop){ - //update prop with temp_prop + if (queue->blocked()){ sendPktRetry = true; return false; } else - queue->push(pkt); + queue->push(writePkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } return true; - } return true; } @@ -117,12 +116,21 @@ Apply::processNextApplyEvent(){ pushPort = ApplyReqPort; while(!queue.empty()){ auto pkt = queue.pop() - /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel responsehere - if (!ret || !push) - break; + uint64_t* data = pkt->getPtr(); + uint32_t* prop = data; + uint32_t* temp_prop = prop + 1; + if (*temp_prop != *prop){ + //update prop with temp_prop + *prop = min(*prop , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel response here + if (!ret || !push) + break; + } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 2ae593a1cb..e9c27a1fcf 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -61,7 +61,7 @@ class Apply : public ClockedObject class ApplyReqPort : public RequestPort { private: - APPLY *owner; + Apply *owner; bool _blocked; PacketPtr blockedPacket; @@ -124,9 +124,11 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - ApplyQueue applyQueue; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; ApplyMemPort memPort; - public(const ApplyParams &apply); + std::pair + public(const ApplyParams &apply); //fix this }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From 9551d03e2bba4d76678b1771eb9c3b41db3de473 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 12:04:02 
-0800 Subject: [PATCH 006/279] [wip] Improving the implementation. Adding address range, python params. --- src/accl/Apply.py | 39 ++++++++++++ src/accl/apply.cc | 153 +++++++++++++++++++++++++++++++++++----------- src/accl/apply.hh | 42 ++++++++++--- 3 files changed, 191 insertions(+), 43 deletions(-) create mode 100644 src/accl/Apply.py diff --git a/src/accl/Apply.py b/src/accl/Apply.py new file mode 100644 index 0000000000..01c627d4c8 --- /dev/null +++ b/src/accl/Apply.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class Apply(ClockedObject): + type = 'Apply' + cxx_header = "accl/apply.hh" + cxx_class = 'gem5::Apply' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b0ef5e8513..d605537033 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -26,22 +26,41 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.h" +#include "accl/apply.hh" #include - -typedef std::pair ReqPair; -typedef std::pair QueuePair; - Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), queueSize(params.applyQueueSize) //add this to .py { applyReadQueue(queueSize); - pplyWriteQueue(queueSize); + applyWriteQueue(queueSize); +} + +Port & +Apply::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +Apply::ApplyRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) @@ -52,6 +71,65 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + + +virtual bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryReq(); +} + +void +Apply::ApplyMemPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyRequestPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +AddrRangeList +Apply::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool 
Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue->blocked()){ @@ -59,34 +137,29 @@ bool Apply::handleWL(PacketPtr pkt){ return false; } else queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } - void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - memPort = ApplyMemPort while(!queue.empty()){ - auto pkt = queue.pop() - /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - bool ret = memPort->sendPacket(memPkt); - // handel responsehere - if (!ret) - break; + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; + } + // conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + memPort->sendPacket(memPkt); + } + else + return; } - -} - -virtual bool -Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); } bool @@ -107,31 +180,39 @@ Apply::handleMemResp(PacktPtr pkt) return true; } - - void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - memPort = ApplyMemPort; - pushPort = ApplyReqPort; while(!queue.empty()){ - auto pkt = queue.pop() + auto pkt = queue.front(); uint64_t* data = pkt->getPtr(); uint32_t* prop = data; uint32_t* temp_prop = prop + 1; if (*temp_prop != *prop){ //update prop with temp_prop *prop = min(*prop , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel response here - if (!ret || !push) + if (!memPort->blocked() && !reqPort->blocked()){ //re-think this + 
memPort->sendPacket(pkt); + applyReqPort->sendPacket(pkt); + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else break; } - + else{ + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } } - } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e9c27a1fcf..fab4cf871a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -56,6 +56,7 @@ class Apply : public ClockedObject virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + void trySendRetry(); } class ApplyReqPort : public RequestPort @@ -64,7 +65,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ std::queue applyQueue; const uint_32 queueSize; @@ -83,12 +83,19 @@ class Apply : public ClockedObject ApplyQueue(uint32_t qSize): queueSize(qSize){} }; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); + void sendPacket(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + protected: + void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); - } + }; class ApplyMemPort : public RequestPort { @@ -96,13 +103,21 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + + protected: virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; - } bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -110,7 +125,6 @@ class Apply : public ClockedObject bool handleMemResp(PacktPtr resp); void writePushBuffer(); - //Events void 
processNextApplyCheckEvent(); /* Syncronously checked @@ -124,11 +138,25 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ + void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; + + void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; + + AddrRangeList getAddrRanges() const; + ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; + ApplyMemPort memPort; - std::pair - public(const ApplyParams &apply); //fix this + ApplyRespPort respPort; + ApplyRequestPort reqPort; + + public: + Apply(const ApplyParams &apply); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From f2825cce4b64277c084d816bf0b6a5eb3e71d95a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:14:27 -0800 Subject: [PATCH 007/279] [wip] minor fixes to Apply engine --- src/accl/apply.cc | 8 ++++---- src/accl/apply.hh | 44 +++++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d605537033..6ad630f0ac 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -96,7 +96,7 @@ WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) void Apply::ApplyMemPort::trySendRetry() { - sendRetryReq(); + sendRetryResp(); } void @@ -108,7 +108,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -117,7 +117,7 @@ WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyRequestPort::recvReqRetry() +Apply::ApplyReqtPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -158,7 +158,7 @@ void Apply::processNextApplyCheckEvent(){ memPort->sendPacket(memPkt); } else - return; + break; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index fab4cf871a..dae3d8ec0e 100644 --- 
a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -43,11 +43,29 @@ class Apply : public ClockedObject { private: + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + class ApplyRespPort : public ResponsePort { private: Apply *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -55,9 +73,11 @@ class Apply : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); void trySendRetry(); - } + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + }; class ApplyReqPort : public RequestPort { @@ -65,24 +85,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ - std::queue applyQueue; - const uint_32 queueSize; - bool sendPktRetry; - - bool blocked(){ - return applyQueue.size() == queueSize; - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize){} - }; public: ApplyReqPort(const std::string& name, SimObject* _owner, From 0a8479c122c2a8647fc3ccabcd7f2ecd93419c1a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 7 Feb 2022 12:26:01 -0800 Subject: [PATCH 008/279] Worklist engine implementation --- src/accl/wl_engine.cc | 185 ++++++++++++++++++++++++++++++++++++++++++ src/accl/wl_engine.hh | 143 ++++++++++++++++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 src/accl/wl_engine.cc create mode 100644 src/accl/wl_engine.hh diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc new file mode 100644 index 0000000000..28f8a4fe11 --- /dev/null +++ b/src/accl/wl_engine.cc @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 The Regents of the 
University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/wl_engine.hh" + +#include + + +WLEngine::WLEngine(const WLEngineParams ¶ms): + ClockedObject(params), + nextWLReadEvent([this]{processNextWLReadEvent; }, name()), + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), + queueSize(params.wlQueueSize) //add this to .py +{ + wlReadQueue(queueSize); + wlWriteQueue(queueSize); +} + +bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!this->handleWLUpdate(pkt)){ + return false; + } + return true; +} + +bool WLEngine::handleWLUpdate(PacketPtr pkt){ + auto queue = wlReadQueue; + if (queue->blocked()){ + queue->sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } + return true; +} + + +void WLEngine::processNextWLReadEvent(){ + auto queue = wlReadQueue; + memPort = WLMemPort + while(!queue.empty()){ //create a map instead of front + auto pkt = queue.front() + /// conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + if (!memPort->blocked()){ + memPort->sendPacket(memPkt); + break; + } + } + +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; + + owner->wakeUp(); //TODO +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +bool +WLEngine::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(writePkt); + + if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } + return true; + return true; +} + +void +WLEngine::processNextWLReduceEvent(){ + auto queue = wlWriteQueue; + auto updateQ = wlReadQueue; + memPort = WLMemPort; + applyPort = WLReqPort; + while(!queue.empty()){ + auto update = updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + WLRespPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + auto pkt = queue.front() + uint64_t* updatePtr = pkt->getPtr(); + uint64_t* data = pkt->getPtr(); + uint32_t* value = updatePtr; + uint32_t* temp_prop = prop + 1; + if (*value != *prop){ + //update prop with temp_prop + *temp_prop = min(*value , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + if (!memPort->blocked() && !applyPort->blocked()){ + memPort->sendPacket(pkt); + applyPort->sendPacket(pkt); + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else + break; + } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + + } + + } + +} + +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh new file mode 100644 index 0000000000..7269965ff2 --- 
/dev/null +++ b/src/accl/wl_engine.hh @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_WLE_HH__ +#define __ACCL_WLE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class WLEngine : public ClockedObject +{ + private: + + struct WLQueue{ + std::queue wlQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return wlQueue.size() == queueSize; + } + bool empty(){ + return wlQueue.empty(); + } + void push(PacketPtr pkt){ + wlQueue.push(pkt); + } + + WLReqPort(uint32_t qSize): + queueSize(qSize){} + }; + + class WLRespPort : public ResponsePort //From Push engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + WLRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLReqPort : public RequestPort //To Apply Engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void trySendRetry(); + virtual bool recvTimingResp(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLMemPort : public RequestPort + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + } + bool handleWLU(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readWLBuffer(); + bool handleMemResp(PacktPtr resp); + + + //Events + void processNextWLReadEvent(); + /* Syncronously checked + If there are any active vertecies: + create 
memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextWLReduceEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + WLQueue wlReadQueue; + WLQueue wlWriteQueue; + WLMemPort memPort; + std::pair + public: + WLEngine(const WLEngineParams ¶ms); //fix this +}; + +#endif // __ACCL_WLE_HH__ \ No newline at end of file From d4c81ddf599c33978ca0c13e17bf34c498a2262d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:06:32 -0800 Subject: [PATCH 009/279] [wip] Adding the python file to the WLE --- src/accl/WLEngine.py | 39 ++++++++++++ src/accl/wl_engine.cc | 138 ++++++++++++++++++++++++++++-------------- src/accl/wl_engine.hh | 46 ++++++++++---- 3 files changed, 165 insertions(+), 58 deletions(-) create mode 100644 src/accl/WLEngine.py diff --git a/src/accl/WLEngine.py b/src/accl/WLEngine.py new file mode 100644 index 0000000000..fe6b25b6ba --- /dev/null +++ b/src/accl/WLEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class WLEngine(ClockedObject): + type = 'WLEngine' + cxx_header = "accl/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + respPort = ResponsePort("Receives updates") + reqPort = RequestPort("Sends requests to Apply") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 28f8a4fe11..fbf201720d 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,9 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), queueSize(params.wlQueueSize) //add this to .py @@ -41,6 +44,26 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): wlWriteQueue(queueSize); } +Port & +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == 
"memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +WLEngine::WLRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { if (!this->handleWLUpdate(pkt)){ @@ -49,6 +72,68 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. + assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} + +void +WLEngine::WLReqPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +AddrRangeList +WLEngine::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = wlReadQueue; if (queue->blocked()){ @@ -63,14 +148,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } - void WLEngine::processNextWLReadEvent(){ auto queue = wlReadQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); if (!memPort->blocked()){ memPort->sendPacket(memPkt); @@ -80,37 +165,10 @@ void WLEngine::processNextWLReadEvent(){ } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -WLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; - - owner->wakeUp(); //TODO -} - -virtual bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); -} - bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = applyWriteQueue; + auto queue = wlWriteQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -128,12 +186,11 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = wlWriteQueue; auto updateQ = wlReadQueue; - memPort = WLMemPort; - applyPort = WLReqPort; + applyPort = reqPort; while(!queue.empty()){ auto update = updateQ.pop() if (!updateQ->blocked() & updateQ->sendPktRetry){ - WLRespPort->trySendRetry(); + respPort->trySendRetry(); updateQ->sendPktRetry = false; } auto pkt = queue.front() @@ -144,7 +201,8 @@ WLEngine::processNextWLReduceEvent(){ if (*value != *prop){ //update prop with temp_prop *temp_prop = min(*value , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); if (!memPort->blocked() && !applyPort->blocked()){ @@ -171,15 +229,3 @@ WLEngine::processNextWLReduceEvent(){ } } - -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7269965ff2..3f39ec7ee8 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -66,7 +66,6 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -74,11 +73,11 @@ class WLEngine : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); + void trySendRetry(); + + protected: virtual bool recvTimingReq(PacketPtr pkt); - bool blocked(){ - return 
_blocked; - } - } + }; class WLReqPort : public RequestPort //To Apply Engine { @@ -86,15 +85,19 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void trySendRetry(); - virtual bool recvTimingResp(PacketPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked(){ return _blocked; } - } + + protected: + void recvReqRetry() override; + virtual bool recvTimingResp(PacketPtr pkt); + }; class WLMemPort : public RequestPort { @@ -102,16 +105,21 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); void sendPacket(PacktPtr pkt); - virtual bool recvTimingResp(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; } - } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -131,13 +139,27 @@ class WLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; + + void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; + + AddrRangeList getAddrRanges() const; WLQueue wlReadQueue; WLQueue wlWriteQueue; WLMemPort memPort; - std::pair + + WLMemPort memPort; + WLRespPort respPort; + WLRequestPort reqPort; + public: - WLEngine(const WLEngineParams ¶ms); //fix this + + WLEngine(const WLEngineParams ¶ms); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_WLE_HH__ \ No newline at end of file From a98a74365e2f7d038c4c298d3b30122df454c638 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 13:06:45 -0800 Subject: [PATCH 010/279] Changing some small errors --- 
src/accl/wl_engine.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index fbf201720d..e49ad44bf1 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -162,7 +162,6 @@ void WLEngine::processNextWLReadEvent(){ break; } } - } bool @@ -188,12 +187,8 @@ WLEngine::processNextWLReduceEvent(){ auto updateQ = wlReadQueue; applyPort = reqPort; while(!queue.empty()){ - auto update = updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - auto pkt = queue.front() + auto update = updateQ.front(); + auto pkt = queue.front(); uint64_t* updatePtr = pkt->getPtr(); uint64_t* data = pkt->getPtr(); uint32_t* value = updatePtr; @@ -213,6 +208,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop(); + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } else break; @@ -223,6 +223,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } From 17515d0b6ec624dbc49e28231c63a0e8284779d7 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 17:39:58 -0800 Subject: [PATCH 011/279] [wip] using util in the creating memory packets --- src/accl/apply.cc | 69 ++++++++++++++++++------------ src/accl/apply.hh | 6 +++ src/accl/util.cc | 43 +++++++++++++++++++ src/accl/util.hh | 3 +- src/accl/wl_engine.cc | 97 ++++++++++++++++++++++++------------------- src/accl/wl_engine.hh | 10 ++++- 6 files changed, 155 insertions(+), 73 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6ad630f0ac..6b474d5628 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -32,6 +32,8 @@ Apply::Apply(const ApplyParams 
¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -145,20 +147,25 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - while(!queue.empty()){ - if(!memPort->blocked()){ - auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; - } - // conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - memPort->sendPacket(memPkt); + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; } - else - break; + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; + memPort->sendPacket(memPkt); + } + else{ + break; + } + if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); } } @@ -183,21 +190,27 @@ Apply::handleMemResp(PacktPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - while(!queue.empty()){ auto pkt = queue.front(); - uint64_t* data = pkt->getPtr(); - uint32_t* prop = data; - uint32_t* temp_prop = prop + 1; - if (*temp_prop != *prop){ - //update prop with temp_prop - *prop = min(*prop , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - writePkt->setData(data); - if (!memPort->blocked() && !reqPort->blocked()){ //re-think this - memPort->sendPacket(pkt); - applyReqPort->sendPacket(pkt); + uint8_t* data = pkt->getPtr(); + + RequestPtr req = 
pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; + + if (temp_prop != prop){ + if (!memPort->blocked() && !reqPort->blocked()){ + //update prop with temp_prop + wl.prop = min(prop , temp_prop); + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyReqPort->sendPacket(writePkt); queue.pop(); if(queue->sendPktRetry && !queue->blocked()){ memPort->trySendRetry(); @@ -214,5 +227,7 @@ Apply::processNextApplyEvent(){ queue->sendPktRetry = false; } } + if(!queue.empty() && !nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); } } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index dae3d8ec0e..b213d37667 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -146,6 +147,9 @@ class Apply : public ClockedObject void processNextApplyCheckEvent(); EventFunctionWrapper nextApplyCheckEvent; + System* const system; + const RequestorID requestorId; + AddrRangeList getAddrRanges() const; ApplyQueue applyReadQueue; @@ -155,6 +159,8 @@ class Apply : public ClockedObject ApplyRespPort respPort; ApplyRequestPort reqPort; + std::unordered_map requestOffset; + public: Apply(const ApplyParams &apply); Port &getPort(const std::string &if_name, diff --git a/src/accl/util.cc b/src/accl/util.cc index 8d975c482f..8debd3a937 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -42,3 +42,46 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + +PacketPtr getWritePacket(Addr addr, + unsigned 
int size, + uint8_t* data, + RequestorID requestorId) +{ + equestPtr req = std::make_shared(addr, size, 0, + requestorId); + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +unit8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem)/sizeof(uint_8) + uint_8* data = new uint8_t [data_size]; + uint_32* wList = (uint_32*)data; + *wList = wl.prop; + *wList + 1 = wl.temp_prop; + *wList + 2 = wl.degree; + *wList + 3 = wl.edgeIndex; + + return data; +} \ No newline at end of file diff --git a/src/accl/util.hh b/src/accl/util.hh index 18b8e4c197..00ccb7ddd9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -34,7 +34,7 @@ struct WorkListItem uint32_t temp_prop; uint32_t prop; uint32_t degree; - Addr edgeList; + uint32_t edgeIndex; } struct Edge @@ -44,6 +44,7 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); +unit8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index e49ad44bf1..7d6d707ae6 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -40,8 +42,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), 
queueSize(params.wlQueueSize) //add this to .py { - wlReadQueue(queueSize); - wlWriteQueue(queueSize); + updateQueue(queueSize); + responseQueue(queueSize); } Port & @@ -135,7 +137,7 @@ WLEngine::getAddrRanges() const } bool WLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = wlReadQueue; + auto queue = updateQueue; if (queue->blocked()){ queue->sendPktRetry = true; return false; @@ -149,25 +151,32 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ } void WLEngine::processNextWLReadEvent(){ - auto queue = wlReadQueue; + auto queue = updateQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); + std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; if (!memPort->blocked()){ + queue.pop() memPort->sendPacket(memPkt); break; } } + if(!queue.empty() && !nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } } bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = wlWriteQueue; + auto queue = responseQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -183,54 +192,56 @@ WLEngine::handleMemResp(PacktPtr pkt) void WLEngine::processNextWLReduceEvent(){ - auto queue = wlWriteQueue; - auto updateQ = wlReadQueue; + auto queue = responseQueue; + auto updateQ = updateQueue; applyPort = reqPort; - while(!queue.empty()){ - auto update = updateQ.front(); - auto pkt = queue.front(); - uint64_t* updatePtr = pkt->getPtr(); - uint64_t* data = pkt->getPtr(); - uint32_t* value = updatePtr; - uint32_t* temp_prop = prop + 1; - if (*value != *prop){ - //update prop with temp_prop - *temp_prop = min(*value , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - 
writePkt->setData(data); - if (!memPort->blocked() && !applyPort->blocked()){ - memPort->sendPacket(pkt); - applyPort->sendPacket(pkt); - queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - } - else - break; - } - else{ + auto update = updateQ.front(); + auto value = update->getPtr(); + auto pkt = queue.front(); + uint8_t* data = pkt->getPtr(); + RequestPtr req = pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset) + uint32_t temp_prop = wl.temp_prop; + if (temp_prop != *value){ + //update prop with temp_prop + temp_prop = min(value , temp_prop); + if (!memPort->blocked() && !applyPort->blocked()){ + wl.temp_prop = temp_prop; + unit8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyPort->sendPacket(writePkt); queue.pop(); if (!queue->blocked() && queue->sendPktRetry){ memPort->trySendRetry(); queue->sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ->blocked() & updateQ->sendPktRetry){ respPort->trySendRetry(); updateQ->sendPktRetry = false; } - } - + else + break; } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + } + if(!queue && !nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 3f39ec7ee8..7132283463 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,6 +32,7 @@ #include #include 
+#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -39,6 +40,7 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" + class WLEngine : public ClockedObject { private: @@ -145,10 +147,14 @@ class WLEngine : public ClockedObject void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + System* const system; + const RequestorID requestorId; + std::unordered_map requestOffset; + AddrRangeList getAddrRanges() const; - WLQueue wlReadQueue; - WLQueue wlWriteQueue; + WLQueue updateQueue; + WLQueue responseQueue; WLMemPort memPort; WLMemPort memPort; From f023fed96d5435117da823fac36e21f15b0a6b84 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 21:15:35 -0800 Subject: [PATCH 012/279] Completing PushEngine. --- src/accl/push_engine.cc | 174 ++++++++++++++++++++++++++++++---------- src/accl/push_engine.hh | 24 ++++-- src/accl/util.cc | 43 +++++++++- src/accl/util.hh | 6 +- src/mem/packet.hh | 2 + 5 files changed, 196 insertions(+), 53 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bc3138f61e..cd5f73eea3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,26 +26,25 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "accl/util.hh" #include "accl/push_engine.hh" - #include "debug/PushEngine.hh" -PushEngine::PushEngine(const PushEngineParams& params): - ClockedObject(params), +PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - vertexQueueSize(params.vertex_queue_size), - vertexQueueLen(0), - updateQueue(params.update_queue_size), - updateQueueLen(0), - nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), - nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextCreateEvent([this]{ processNextCreateEvent(); }, name()), - nextSendEvent([this]{ processNextSendEvent(); }, name()) -{} + // vertexQueueSize(params.vertex_queue_size), + // vertexQueueLen(0), + // updateQueue(params.update_queue_size), + // updateQueueLen(0), + nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextSendEvent([this] { processNextSendEvent(); }, name()) +{ +} Port & PushEngine::getPort(const std::string &if_name, PortID idx) @@ -61,60 +60,151 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -bool -PushEngine::handleUpdate(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() { - if (vertexQueueLen < vertexQueueSize) { - vertexQueue.push(pkt) - vertexQueueLen++; - return true; + owner->memPort->getAddrRanges(); +} - if (!nextReceiveEvent.scheduled()){ - schedule(nextReceiveEvent, nextCycle()); - } +bool PushEngine::handleUpdate(PacketPtr pkt) +{ + // if (vertexQueueLen < vertexQueueSize) { + // vertexQueue.push(pkt) + // vertexQueueLen++; + // if (!nextReceiveEvent.scheduled()) { + // 
schedule(nextReceiveEvent, nextCycle()); + // } + // return true; + // } + // return false; + vertexQueue.push(pkt) + if (!nextReceiveEvent.scheduled()) { + schedule(nextReceiveEvent, nextCycle()); } - return false; + return true; } -void -PushEngine::processNextReceiveEvent() +void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.pop(); - uint8_t* data = updatePkt->getData(); - - Addr edgeListAddr = ; // TODO: Generalize finding this address. - int outDegree = ; // TODO: Generalize finding this value. - - Addr reqAddr = (edgeListAddr / 64) * 64; - Addr offsetAddr = edgeListAddr % 64; + uint8_t *data = updatePkt->getData(); + + // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) + uint32_t edge_index = *((uint32_t *)data); + uint32_t degree = *((uint32_t *)(data + 4)); + uint32_t value = *((uint32_t *)(data + 8)); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < degree; index++) { + Addr edge_addr = (edge_index + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } - PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + for (int index = 0; index < addr_queue.size(); inedx++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + memReqQueue.push(pkt); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = value; + } - memPort.sendPacket(pkt); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } +} +void PushEngine::processNextReadEvent() +{ + PacketPtr pkt = 
memReqQueue.front(); + if (!memPort.blocked()) { + memPort.sendPacket(pkt); + memReqQueue.pop(); + } + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } } -void -PushEngine::processNextReadEvent() +bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) { + return owner->handleMemResp(pkt); +} +void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", + this->name()); + _blocked = true; + } } -void -PushEngine::processNextCreateEvent() +void PushEngine::handleMemResp(PacketPtr pkt) { + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + Edge e = memoryToEdge(curr_edge_data); + uint32_t *update_data = new uint32_t; + + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + updateQueue.push(update); + } + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } } -void -PushEngine::processNextSendEvent() + +void PushEngine::processNextSendEvent() { + PacketPtr pkt = updateQueue.front(); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + updateQueue.pop(); + } -} \ No newline at end of file + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 6ab902d0e2..a746dcc265 100644 --- 
a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -51,6 +51,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + //TODO: Implement this; PushRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -65,6 +66,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -78,9 +80,12 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + + void sendPacket(PacktPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } @@ -93,12 +98,18 @@ class PushEngine : public ClockedObject PushMemPort memPort; std::queue vertexQueue; - int vertexQueueSize; - int vertexQueueLen; + // int vertexQueueSize; + // int vertexQueueLen; + + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + + std::queue memReqQueue; // Infinite queueing? 
std::queue updateQueue; - int updateQueueSize; - int updateQueueLen; + // int updateQueueSize; + // int updateQueueLen; EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -106,9 +117,6 @@ class PushEngine : public ClockedObject EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextCreateEvent; - void processNextCreateEvent(); - EventFunctionWrapper nextSendEvent; void processNextSendEvent(); diff --git a/src/accl/util.cc b/src/accl/util.cc index 8debd3a937..76ed6269c2 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,34 @@ #include "accl/util.hh" + +// Edge: (weight: 64 bits, neighbor: 64 bits) +Edge& +memoryToEdge(uint8_t *data) +{ + uint64_t weight = *((uint64_t*) data); + Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes + Edge e = {weight, neighbor}; + return e; +} + +// Edge: (weight: 64 bits, neighbor: 64 bits) +uint8_t* +edgeToMemory(Edge e) +{ + int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); + + uint8_t* data = new uint8_t [data_size]; + + uint64_t* weightPtr = (uint64_t*) data; + *weightPtr = e.weight; + + Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes + *neighborPtr = e.neighbor; + + return data; +} + PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { @@ -43,6 +71,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, @@ -53,6 +82,18 @@ PacketPtr getWritePacket(Addr addr, req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, + requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + 
pkt->allocate(); pkt->setData(data); @@ -84,4 +125,4 @@ workListToMemory(WorkListItem wl){ *wList + 3 = wl.edgeIndex; return data; -} \ No newline at end of file +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 00ccb7ddd9..c309d4967a 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -39,13 +39,15 @@ struct WorkListItem struct Edge { - uint32_t weight; + uint64_t weight; Addr neighbor; } WorkListItem& memoryToWorkList(uint8_t* data); unit8_t* workListToMemory(WorkListItem wl); + Edge& memoryToEdge(uint8_t* data); +uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index ed7a94f4fb..69686e7835 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,6 +149,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; From 04be84a6ba93dbf102b05f1a36f9743abb22a804 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 10:20:19 -0800 Subject: [PATCH 013/279] arch: Accelerator [wip] Adding Sconscript, debugging Change-Id: I0cef6e8745ca8f58a17a01d71dfb090fe1a7e606 --- src/accl/PushEngine.py | 39 ++++++++++++++++++++++ src/accl/SConscript | 36 ++++++++++++++++++++ src/accl/apply.cc | 74 +++++++++++++++++++---------------------- src/accl/apply.hh | 24 +++++++++---- src/accl/push_engine.cc | 2 +- src/accl/util.cc | 2 ++ src/accl/util.hh | 7 ++-- src/accl/wl_engine.cc | 71 +++++++++++++++++++-------------------- src/accl/wl_engine.hh | 20 +++++++---- 9 files changed, 180 insertions(+), 95 deletions(-) create mode 100644 src/accl/PushEngine.py create mode 100644 src/accl/SConscript diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py new file mode 100644 index 0000000000..37639377c1 
--- /dev/null +++ b/src/accl/PushEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class PushEngine(ClockedObject): + type = 'PushEngine' + cxx_header = "accl/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/SConscript b/src/accl/SConscript new file mode 100644 index 0000000000..da0774ca44 --- /dev/null +++ b/src/accl/SConscript @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('WLEngine.py') +# SimObject('Apply.py') +# SimObject('PushEngine.py') + +# Source('apply.cc') +Source('wl_engine.cc') +# Source('push_engine.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6b474d5628..985e6217d7 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -38,11 +38,10 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), - queueSize(params.applyQueueSize) //add this to .py + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()) { - applyReadQueue(queueSize); - applyWriteQueue(queueSize); + applyReadQueue(params.applyQueueSize); + applyWriteQueue(params.applyQueueSize); } Port & @@ -110,7 +109,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -119,7 +118,7 @@ WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyReqtPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -134,12 +133,13 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; - } else - 
queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ + } else{ + queue.push(pkt); + } + if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; @@ -147,22 +147,19 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if(!memPort->blocked()){ + if (!memPort.blocked()){ auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + respPort.trySendRetry(); + queue.sendPktRetry = false; } // conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - memPort->sendPacket(memPkt); - } - else{ - break; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + memPort.sendPacket(memPkt); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -174,11 +171,11 @@ Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -193,41 +190,38 @@ Apply::processNextApplyEvent(){ auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t prop = wl.prop; uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort->blocked() && !reqPort->blocked()){ + if (!memPort.blocked() && 
!reqPort.blocked()){ //update prop with temp_prop - wl.prop = min(prop , temp_prop); + wl.prop = std::min(prop , temp_prop); //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyReqPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyReqPort.sendPacket(writePkt); queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } - else - break; - } - else{ + }else{ queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } if(!queue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } -} \ No newline at end of file +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index b213d37667..f4dabd6a97 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -35,10 +35,12 @@ #include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" -#include "mem/port.hh" +#include "base/types.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "mem/port.hh" +#include "params/Apply.hh" #include "sim/clocked_object.hh" +#include "sim/port.hh" class Apply : public ClockedObject { @@ -46,17 +48,25 @@ class Apply : public ClockedObject struct ApplyQueue{ std::queue applyQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return applyQueue.size() == queueSize; + return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue.empty(); + return applyQueue->empty(); } void push(PacketPtr pkt){ - 
applyQueue.push(pkt); + applyQueue->push(pkt); + } + + void pop(){ + applyQueue->pop(); + } + + void front(){ + applyQueue->front(); } ApplyQueue(uint32_t qSize): @@ -167,4 +177,4 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_APPLY_HH__ \ No newline at end of file +#endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index cd5f73eea3..c02009d25a 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,7 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -#include "debug/PushEngine.hh" +// #include "debug/PushEngine.hh" PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), diff --git a/src/accl/util.cc b/src/accl/util.cc index 76ed6269c2..92f6a3e351 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,8 @@ #include "accl/util.hh" +#include "base/types.hh" +#include "mem/packet.hh" // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& diff --git a/src/accl/util.hh b/src/accl/util.hh index c309d4967a..737d52e2a1 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -35,7 +36,7 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; -} +}; struct Edge { @@ -44,10 +45,10 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); -unit8_t* workListToMemory(WorkListItem wl); +uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7d6d707ae6..757bdd2598 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -39,11 +39,10 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), - queueSize(params.wlQueueSize) //add this to .py + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(queueSize); - responseQueue(queueSize); + updateQueue(params.wlQueueSize); + responseQueue(params.wlQueueSize); } Port & @@ -138,11 +137,11 @@ WLEngine::getAddrRanges() const bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; - if (queue->blocked()){ - queue->sendPktRetry = true; + if (queue.blocked()){ + queue.sendPktRetry = true; return false; } else - queue->push(pkt); + queue.push(pkt); if(!nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -152,19 +151,19 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - memPort = WLMemPort - while(!queue.empty()){ //create a map instead of front + auto memPort = WLMemPort; + while (!queue.empty()){ 
//create a map instead of front auto pkt = queue.front() /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - if (!memPort->blocked()){ + requestOffset[request] = req_offset; + if (!memPort.blocked()){ queue.pop() - memPort->sendPacket(memPkt); + memPort.sendPacket(memPkt); break; } } @@ -177,11 +176,11 @@ bool WLEngine::handleMemResp(PacktPtr pkt) { auto queue = responseQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(writePkt); if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); @@ -199,49 +198,47 @@ WLEngine::processNextWLReduceEvent(){ auto value = update->getPtr(); auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset) uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = min(value , temp_prop); - if (!memPort->blocked() && !applyPort->blocked()){ + temp_prop = std::min(value , temp_prop); + if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; - unit8_t* wlItem = workListToMemory(wl); + uint8_t* wlItem = workListToMemory(wl); memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyPort.sendPacket(writePkt); queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && 
queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - else - break; } else{ queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - if(!queue && !nextWLReduceEvent.scheduled()){ + if (!queue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7132283463..0393cd4cb5 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -37,9 +37,9 @@ #include "base/statistics.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "params/WLEngine.hh" #include "sim/clocked_object.hh" - +#include "sim/port.hh" class WLEngine : public ClockedObject { @@ -47,20 +47,26 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return wlQueue.size() == queueSize; + return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue.empty(); + return wlQueue->empty(); } void push(PacketPtr pkt){ - wlQueue.push(pkt); + wlQueue->push(pkt); + } + void pop(){ + wlQueue->pop(); + } + void front(){ + wlQueue.front()); } - WLReqPort(uint32_t qSize): + WLQueue(uint32_t qSize): queueSize(qSize){} }; From ab7fc2e540018d791c66b89bf46587fa2e4aeba9 Mon Sep 17 00:00:00 2001 From: 
Mahyar Samani Date: Mon, 14 Feb 2022 10:02:36 -0800 Subject: [PATCH 014/279] Addin simobject file and startup for PushEngine. --- src/accl/PushEngine.py | 11 ++++++----- src/accl/push_engine.cc | 37 ++++++++++++++++++++++++++++++++++++- src/accl/push_engine.hh | 3 +++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 37639377c1..3215fdaee2 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -26,14 +26,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'PushEngine' + type = 'WLEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") + system = Param.System(Parent.any, "The system object this push engine is a part of") + respPort = ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index c02009d25a..f1f8f7698b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -60,6 +60,40 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr 
pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + +} + bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); @@ -104,7 +138,8 @@ void PushEngine::processNextReceiveEvent() std::vector num_edge_queue; for (uint32_t index = 0; index < degree; index++) { - Addr edge_addr = (edge_index + index) * sizeof(Edge); + // FIXME: For now the base edge address is 1048576 + Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index a746dcc265..077c61aa2b 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,6 +39,7 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" +//FIXME: Add gem5 namespace here class PushEngine : public ClockedObject { private: @@ -89,6 +90,8 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } + virtual void startup() override; + System* const system; const RequestorID requestorId; From 154de1bb6e5b8a87a2725bdaadebbbe0dd53a3fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 12:26:30 -0800 Subject: [PATCH 015/279] Bug fixes. 
--- src/accl/SConscript | 8 ++--- src/accl/util.cc | 82 +++++++++++++++++++++++++-------------------- src/accl/util.hh | 7 ++-- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index da0774ca44..4b78ff9e80 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -28,9 +28,9 @@ Import('*') SimObject('WLEngine.py') -# SimObject('Apply.py') -# SimObject('PushEngine.py') +SimObject('Apply.py') +SimObject('PushEngine.py') -# Source('apply.cc') +Source('apply.cc') Source('wl_engine.cc') -# Source('push_engine.cc') +Source('push_engine.cc') diff --git a/src/accl/util.cc b/src/accl/util.cc index 92f6a3e351..b81ba4db7d 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,8 +28,39 @@ #include "accl/util.hh" -#include "base/types.hh" -#include "mem/packet.hh" +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +uint8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem) / sizeof(uint8_t); + uint8_t* data = new uint8_t [data_size]; + + uint32_t* tempPtr = (uint32_t*) data; + *tempPtr = wl.temp_prop; + + uint32_t* propPtr = (uint32_t*) (data + 4); + *propPtr = wl.prop; + + uint32_t* degreePtr = (uint32_t*) (data + 8); + *degreePtr = wl.degree; + + uint32_t* edgePtr = (uint32_t*) (data + 12); + *edgePtr = wl.edgeIndex; + + return data; +} // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& @@ -58,7 +89,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -73,19 +104,24 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } - -PacketPtr getWritePacket(Addr 
addr, - unsigned int size, - uint8_t* data, - RequestorID requestorId) +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId) { - equestPtr req = std::make_shared(addr, size, 0, + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} -PacketPtr +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) { RequestPtr req = std::make_shared(addr, size, 0, @@ -102,29 +138,3 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } -WorkListItem& -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); - - uint32_t prop = *((uint32_t*) (data + 4)); - - uint32_t degree = *((uint32_t*) (data + 8)); - - uint32_t addr = *((uint32_t*) (data + 12)); - - retrun wl = {temp_prop, prop, degree, addr}; -} - -unit8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem)/sizeof(uint_8) - uint_8* data = new uint8_t [data_size]; - uint_32* wList = (uint_32*)data; - *wList = wl.prop; - *wList + 1 = wl.temp_prop; - *wList + 2 = wl.degree; - *wList + 3 = wl.edgeIndex; - - return data; -} diff --git a/src/accl/util.hh b/src/accl/util.hh index 737d52e2a1..da5a0736c9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,7 +26,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -50,5 +49,9 @@ uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr& getReadPacket(Addr addr, unsigned int size, + RequestorID requestorId); +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); From 31ce9a843295e73498a52ebbc70ed190ec9158d8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 13:05:49 -0800 Subject: [PATCH 016/279] More bug fixes. --- src/accl/SConscript | 5 +++-- src/accl/apply.cc | 5 +++++ src/accl/apply.hh | 5 +++++ src/accl/push_engine.cc | 6 +++++- src/accl/push_engine.hh | 5 ++++- src/accl/util.hh | 3 +-- src/accl/wl_engine.cc | 4 ++++ src/accl/wl_engine.hh | 11 ++++++++--- 8 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index 4b78ff9e80..18ac71eb7d 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -27,10 +27,11 @@ Import('*') -SimObject('WLEngine.py') SimObject('Apply.py') SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply.cc') -Source('wl_engine.cc') Source('push_engine.cc') +Source('wl_engine.cc') +Source('util.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 985e6217d7..678f240bf6 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,9 @@ #include +namespace gem5 +{ + Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), @@ -225,3 +228,5 @@ Apply::processNextApplyEvent(){ schedule(nextApplyEvent, nextCycle()); } } + +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f4dabd6a97..42cb310136 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -42,6 +42,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" 
+namespace gem5 +{ + class Apply : public ClockedObject { private: @@ -177,4 +180,6 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; +} + #endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index f1f8f7698b..57fa560ff7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,9 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -// #include "debug/PushEngine.hh" + +namespace gem5 +{ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), @@ -243,3 +245,5 @@ void PushEngine::processNextSendEvent() schedule(nextSendEvent, nextCycle()); } } + +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 077c61aa2b..cc129076a5 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,7 +39,9 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" -//FIXME: Add gem5 namespace here +namespace gem5 +{ + class PushEngine : public ClockedObject { private: @@ -134,4 +136,5 @@ class PushEngine : public ClockedObject }; +} #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/util.hh b/src/accl/util.hh index da5a0736c9..76d67ce6df 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -51,7 +51,6 @@ uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& -getWritePacket(Addr addr, unsigned int size, +PacketPtr& getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 757bdd2598..00371e56cc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +namespace gem5 +{ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), @@ -242,3 +244,5 @@ WLEngine::processNextWLReduceEvent(){ schedule(nextWLReduceEvent, nextCycle()); } } + +} diff 
--git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 0393cd4cb5..8c69bba7f7 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -41,6 +41,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" +namespace gem5 +{ + class WLEngine : public ClockedObject { private: @@ -117,7 +120,7 @@ class WLEngine : public ClockedObject public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; @@ -132,7 +135,7 @@ class WLEngine : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); //Events @@ -174,4 +177,6 @@ class WLEngine : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_WLE_HH__ \ No newline at end of file +} + +#endif // __ACCL_WLE_HH__ From fb1dda4caa40ce661c2bcc18181b9113db5231e0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 14:18:14 -0800 Subject: [PATCH 017/279] Even more bug fixes. --- src/accl/push_engine.cc | 28 +++++++++++++++++++++++----- src/accl/push_engine.hh | 35 +++++++++++++++++++++-------------- src/accl/util.cc | 24 ++++++++++++++---------- src/accl/util.hh | 18 ++++++++++++------ 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 57fa560ff7..56a57e76ac 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -65,6 +65,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { + //FIXME: This is the current version of our initializer. + // This should be updated in the future. 
WorkListItem vertices [5] = { {0, 0, 3, 0}, // Addr: 0 {0, 0, 1, 3}, // Addr: 16 @@ -109,6 +111,7 @@ PushEngine::PushRespPort::getAddrRanges() bool PushEngine::handleUpdate(PacketPtr pkt) { + //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { // vertexQueue.push(pkt) // vertexQueueLen++; @@ -192,20 +195,19 @@ bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { blockedPacket = pkt; - DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", - this->name()); _blocked = true; } } -void PushEngine::handleMemResp(PacketPtr pkt) +bool PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -230,8 +232,12 @@ void PushEngine::handleMemResp(PacketPtr pkt) if (!nextSendEvent.scheduled() && !updateQueue.empty()) { schedule(nextSendEvent, nextCycle()); } -} + //TODO: Should we always return true? It's the response from the memory + // so maybe yes. We assume the receiving bandwidth of the PushEngine is + // higher than its demand bandwidth + return true; +} void PushEngine::processNextSendEvent() { @@ -246,4 +252,16 @@ void PushEngine::processNextSendEvent() } } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index cc129076a5..7b5f483431 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,10 +54,10 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: - //TODO: Implement this; - PushRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushRespPort(const std::string& name, PushEngine* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); } @@ -65,27 +65,32 @@ class PushEngine : public ClockedObject class PushReqPort : public RequestPort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushReqPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } class PushMemPort : public RequestPort { private: + PushEngine* owner bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + PushMemPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacktPtr pkt); bool blocked() { return _blocked; } @@ -106,9 +111,9 @@ class PushEngine : public ClockedObject // int vertexQueueSize; // int vertexQueueLen; - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map 
reqValueMap; std::queue memReqQueue; // Infinite queueing? @@ -127,6 +132,8 @@ class PushEngine : public ClockedObject bool handleUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/util.cc b/src/accl/util.cc index b81ba4db7d..40a1fc761b 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,18 +28,20 @@ #include "accl/util.hh" -WorkListItem& +namespace gem5 +{ + +WorkListItem memoryToWorkList(uint8_t* data){ WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); + uint32_t temp_prop = *((uint32_t*) data); uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - retrun wl = {temp_prop, prop, degree, addr}; + wl = {temp_prop, prop, degree, addr}; + return wl; } uint8_t* @@ -63,7 +65,7 @@ workListToMemory(WorkListItem wl){ } // Edge: (weight: 64 bits, neighbor: 64 bits) -Edge& +Edge memoryToEdge(uint8_t *data) { uint64_t weight = *((uint64_t*) data); @@ -89,7 +91,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr& +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -104,7 +106,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } -PacketPtr& +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId) { @@ -121,8 +123,9 @@ getWritePacket(Addr addr, unsigned int size, return pkt; } -PacketPtr& -getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -138,3 +141,4 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 76d67ce6df..91692488a4 100644 --- a/src/accl/util.hh +++ 
b/src/accl/util.hh @@ -28,6 +28,10 @@ #include "base/types.hh" #include "mem/packet.hh" +#include "mem/request.hh" + +namespace gem5 +{ struct WorkListItem { @@ -41,16 +45,18 @@ struct Edge { uint64_t weight; Addr neighbor; -} +}; -WorkListItem& memoryToWorkList(uint8_t* data); +WorkListItem memoryToWorkList(uint8_t* data); uint8_t* workListToMemory(WorkListItem wl); -Edge& memoryToEdge(uint8_t* data); +Edge memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getWritePacket(Addr addr, unsigned int size, +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + +} From d2584a1eb9c19f278ed658e4619f8944dd4de4c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 15:46:21 -0800 Subject: [PATCH 018/279] Bug fixes, bug fixes everywhere. 
--- src/accl/apply.cc | 12 ++++---- src/accl/apply.hh | 61 ++++++++++++++++++++--------------------- src/accl/push_engine.cc | 8 +++++- src/accl/push_engine.hh | 17 ++++++------ src/accl/wl_engine.hh | 17 +++++------- 5 files changed, 60 insertions(+), 55 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 678f240bf6..c44738d3fa 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -62,14 +62,14 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const +Apply::ApplyRespPort::getAddrRanges() { return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWL(pkt)){ + if (!owner->handleWL(pkt)){ return false; } return true; @@ -82,15 +82,17 @@ Apply::ApplyRespPort::trySendRetry() } -virtual bool +bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void -WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 42cb310136..788550646a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -33,14 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" -#include "base/types.hh" +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/Apply.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -58,18 +57,18 @@ class Apply : public ClockedObject return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue->empty(); + return applyQueue.empty(); } void push(PacketPtr pkt){ - applyQueue->push(pkt); + applyQueue.push(pkt); } void pop(){ - applyQueue->pop(); + applyQueue.pop(); } 
void front(){ - applyQueue->front(); + applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -80,16 +79,17 @@ class Apply : public ClockedObject { private: Apply *owner; + bool _blocked; PacketPtr blockedPacket; public: - ApplyRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyRespPort(const std::string& name, Apply* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} - virtual AddrRangeList getAddrRanges(); void trySendRetry(); - - protected: + virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); }; @@ -101,12 +101,13 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyReqPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: void recvReqRetry() override; @@ -121,13 +122,14 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyMemPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked(){ return _blocked;} protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -138,28 +140,24 @@ class Apply : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readApplyBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); void writePushBuffer(); //Events void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; /* Syncronously checked If there are any 
active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - System* const system; const RequestorID requestorId; @@ -170,13 +168,14 @@ class Apply : public ClockedObject ApplyMemPort memPort; ApplyRespPort respPort; - ApplyRequestPort reqPort; + ApplyReqPort reqPort; std::unordered_map requestOffset; public: Apply(const ApplyParams &apply); - Port &getPort(const std::string &if_name, + + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 56a57e76ac..48f1115042 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->memPort->getAddrRanges(); + owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -264,4 +264,10 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 7b5f483431..d478d14df0 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -31,8 +31,7 @@ #include -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/PushEngine.hh" @@ -60,7 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); - } + }; class PushReqPort : public 
RequestPort { @@ -77,12 +76,12 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; class PushMemPort : public RequestPort { private: - PushEngine* owner + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -92,10 +91,10 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; virtual void startup() override; @@ -134,11 +133,13 @@ class PushEngine : public ClockedObject bool handleMemResp(PacketPtr pkt); + AddrRangeList getAddrRanges(); + public: PushEngine(const PushEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8c69bba7f7..6f875adfed 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -33,13 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/WLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -140,40 +140,37 @@ class WLEngine : public ClockedObject //Events void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextWLReadEvent(); - EventFunctionWrapper nextWLReadEvent; - - void 
processNextWLReduceEvent(); - EventFunctionWrapper nextWLReduceEvent; System* const system; const RequestorID requestorId; + std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; WLMemPort memPort; WLRespPort respPort; - WLRequestPort reqPort; + WLReqPort reqPort; public: WLEngine(const WLEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From 8e6a8d51e19b50cd6b61cb60c939bb490b486c23 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 21:33:53 -0800 Subject: [PATCH 019/279] arch: Debugging worklist engine [wip] Adding some missing virtual functions. Change-Id: I26f6c7d789f4b295bac3bc9b2a80f2cadb45b96f --- src/accl/wl_engine.cc | 26 +++++++++++++++++++++++++- src/accl/wl_engine.hh | 4 ++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 00371e56cc..7515e10167 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -81,6 +81,24 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } +virtual void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +virtual Tick +WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +virtual void +WLEngine::WLRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -137,6 +155,12 @@ WLEngine::getAddrRanges() const return memPort.getAddrRanges(); } +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + memPort.recvFunctional(pkt); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ @@ -164,7 +188,7 @@ void WLEngine::processNextWLReadEvent(){ PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ - 
queue.pop() + queue.pop(); memPort.sendPacket(memPkt); break; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 6f875adfed..d2b96db203 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,6 +88,9 @@ class WLEngine : public ClockedObject protected: virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class WLReqPort : public RequestPort //To Apply Engine @@ -159,6 +162,7 @@ class WLEngine : public ClockedObject std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; + void recvFunctional(PacketPtr pkt); WLQueue updateQueue; WLQueue responseQueue; From 6b2bf359d3b83767dce29fb779ff098bad9c46a7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:38:05 -0800 Subject: [PATCH 020/279] Bug fix. --- src/accl/Apply.py | 1 + src/accl/apply.cc | 6 ++--- src/accl/push_engine.cc | 50 ++++++++++++++++++++++++++++++++--------- src/accl/push_engine.hh | 3 +++ 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 01c627d4c8..58639e880a 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -34,6 +34,7 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' + system = Param.System(Parent.any, "The system object this apply engine is a part of") respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index c44738d3fa..70bc8031c9 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -40,8 +40,8 @@ Apply::Apply(const ApplyParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, 
name()) + nextApplyEvent([this]{ processNextApplyEvent(); }, name()), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) { applyReadQueue(params.applyQueueSize); applyWriteQueue(params.applyQueueSize); @@ -172,7 +172,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacktPtr pkt) +Apply::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 48f1115042..6ebe34ebd3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->getAddrRanges(); + return owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -121,7 +121,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt) + vertexQueue.push(pkt); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -130,8 +130,8 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.pop(); - uint8_t *data = updatePkt->getData(); + PacketPtr updatePkt = vertexQueue.front(); + uint8_t *data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); @@ -152,19 +152,19 @@ void PushEngine::processNextReceiveEvent() num_edge_queue.back()++; } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } - for (int index = 0; index < addr_queue.size(); inedx++) { + for (int index = 0; index 
< addr_queue.size(); index++) { PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; @@ -172,6 +172,8 @@ void PushEngine::processNextReceiveEvent() reqValueMap[pkt->req] = value; } + vertexQueue.pop(); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); } @@ -264,10 +266,36 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { return memPort.getAddrRanges(); } +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index d478d14df0..0acedd0da8 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -59,6 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + }; class PushReqPort : public RequestPort @@ -76,6 +77,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; class PushMemPort : public RequestPort @@ -94,6 +96,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; virtual void startup() override; From ea2c878cd465e01f3c775cf67bec3aacd8416c09 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:46:27 -0800 Subject: [PATCH 
021/279] Bug fix. --- src/accl/Apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 58639e880a..d6a4bbe5a9 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class Apply(ClockedObject): From 2ec9667fdd8eb568c4be817b2eb700f7ea71579e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:47:36 -0800 Subject: [PATCH 022/279] Fixing a bug-fix. --- src/accl/apply.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 788550646a..e1b6d33359 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -88,6 +88,7 @@ class Apply : public ClockedObject _blocked(false), blockedPacket(nullptr) {} + protected: void trySendRetry(); virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); From d94f0028cf5ad831447e6b661f8b9f615d3085ca Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 00:13:21 -0800 Subject: [PATCH 023/279] fixing some bugs --- src/accl/Apply.py | 1 + src/accl/WLEngine.py | 4 +++- src/accl/apply.cc | 31 ++++++++++++++++++++---- src/accl/apply.hh | 23 ++++++++++-------- src/accl/wl_engine.cc | 48 +++++++++++++++++++------------------ src/accl/wl_engine.hh | 55 ++++++++++++++++++++++++------------------- 6 files changed, 99 insertions(+), 63 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index d6a4bbe5a9..8720287cc8 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -38,3 +38,4 @@ class Apply(ClockedObject): respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") + applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git 
a/src/accl/WLEngine.py b/src/accl/WLEngine.py index fe6b25b6ba..562fd04423 100644 --- a/src/accl/WLEngine.py +++ b/src/accl/WLEngine.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class WLEngine(ClockedObject): @@ -34,6 +34,8 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' + system = Param.System(Parent.any, "The system object this push WorkList is a part of") respPort = ResponsePort("Receives updates") reqPort = RequestPort("Sends requests to Apply") memPort = RequestPort("Memory side port, sends requests") + wlQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 70bc8031c9..410eff5268 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -81,6 +81,23 @@ Apply::ApplyRespPort::trySendRetry() sendRetryReq(); } +void +Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) +{ + panic("Not implemented"); +} + +Tick +Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +Apply::ApplyRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) @@ -139,7 +156,7 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else{ queue.push(pkt); @@ -177,7 +194,7 @@ Apply::handleMemResp(PacketPtr pkt) auto queue = applyWriteQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else queue.push(pkt); @@ -192,7 +209,7 @@ Apply::handleMemResp(PacketPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - auto pkt = queue.front(); + PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); RequestPtr request = pkt->req; @@ -204,7 +221,11 @@ Apply::processNextApplyEvent(){ if (temp_prop != prop){ if (!memPort.blocked() && !reqPort.blocked()){ //update prop with temp_prop - wl.prop = std::min(prop , temp_prop); + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); @@ -212,7 +233,7 @@ Apply::processNextApplyEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); memPort.sendPacket(writePkt); - applyReqPort.sendPacket(writePkt); + reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e1b6d33359..f08c1fef85 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,12 +63,12 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - void pop(){ - applyQueue.pop(); + PacketPtr pop(){ + return applyQueue->pop(); } - void front(){ - applyQueue.front(); + PacketPtr front(){ + return applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -83,15 +83,18 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: + void trySendRetry(); + virtual AddrRangeList getAddrRanges(); ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} protected: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class ApplyReqPort : public RequestPort @@ -137,6 +140,10 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + ApplyMemPort memPort; + ApplyRespPort respPort; + ApplyReqPort reqPort; + bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one 
for read a priotizes write over read @@ -167,10 +174,6 @@ class Apply : public ClockedObject ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; - ApplyMemPort memPort; - ApplyRespPort respPort; - ApplyReqPort reqPort; - std::unordered_map requestOffset; public: diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7515e10167..9b16a15575 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -36,6 +36,7 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), + queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), @@ -43,8 +44,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(params.wlQueueSize); - responseQueue(params.wlQueueSize); + updateQueue.resize(queueSize); + responseQueue.resize(queueSize); } Port & @@ -69,7 +70,7 @@ WLEngine::WLRespPort::getAddrRanges() const bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWLUpdate(pkt)){ + if (!owner->handleWLUpdate(pkt)){ return false; } return true; @@ -81,19 +82,19 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } -virtual void +void WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { owner->recvFunctional(pkt); } -virtual Tick +Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { panic("recvAtomic unimpl."); } -virtual void +void WLEngine::WLRespPort::recvRespRetry() { panic("recvRespRetry from response port is called."); @@ -118,10 +119,10 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -virtual bool +bool WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void @@ -177,15 +178,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = 
updateQueue; - auto memPort = WLMemPort; while (!queue.empty()){ //create a map instead of front - auto pkt = queue.front() + PacketPtr pkt = queue.front(); /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ queue.pop(); @@ -199,15 +199,15 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacktPtr pkt) +WLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; - } else - queue.push(writePkt); - + } else{ + queue.push(pkt); + } if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } @@ -219,18 +219,20 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - applyPort = reqPort; - auto update = updateQ.front(); - auto value = update->getPtr(); - auto pkt = queue.front(); + auto applyPort = reqPort; + PacketPtr update = updateQ.front(); + uint8_t* value = update->getPtr(); + PacketPtr pkt = queue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset) + WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = std::min(value , temp_prop); + if(*value < temp_prop){ + temp_prop = *value; + } if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; uint8_t* wlItem = workListToMemory(wl); @@ -257,7 +259,7 @@ WLEngine::processNextWLReduceEvent(){ memPort.trySendRetry(); queue.sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ.blocked() & 
updateQ.sendPktRetry){ respPort.trySendRetry(); updateQ.sendPktRetry = false; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index d2b96db203..8d02c16981 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -50,27 +50,32 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint32_t queueSize; + uint32_t queueSize; bool sendPktRetry; + void resize(uint32_t size){ + queueSize = size; + } + bool blocked(){ return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue->empty(); + return wlQueue.empty(); } void push(PacketPtr pkt){ - wlQueue->push(pkt); + wlQueue.push(pkt); } void pop(){ - wlQueue->pop(); + wlQueue.pop(); } - void front(){ - wlQueue.front()); + PacketPtr front(){ + return wlQueue.front(); } WLQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class WLRespPort : public ResponsePort //From Push engine @@ -83,7 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const override; void trySendRetry(); protected: @@ -129,50 +134,52 @@ class WLEngine : public ClockedObject return _blocked; } - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; + System* const system; + const uint32_t queueSize; + const RequestorID requestorId; + + WLReqPort reqPort; + WLRespPort respPort; + WLMemPort memPort; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacketPtr resp); //Events - void processNextWLReadEvent(); EventFunctionWrapper nextWLReadEvent; + void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + 
MPU::MPU::MemPortsendTimingReq */ - void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - std::unordered_map requestOffset; - AddrRangeList getAddrRanges() const; - void recvFunctional(PacketPtr pkt); - WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; - WLRespPort respPort; - WLReqPort reqPort; - public: + public: + AddrRangeList getAddrRanges() const; + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); WLEngine(const WLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; From 19be1374b0506194c6ce92546336d3b405fd066e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:24:55 -0800 Subject: [PATCH 024/279] Bug fix. 
--- src/accl/push_engine.cc | 26 ++++++++++++++++++++------ src/accl/push_engine.hh | 13 ++++++++++++- src/accl/wl_engine.cc | 9 ++------- src/accl/wl_engine.hh | 3 +-- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 6ebe34ebd3..746ed8a142 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -98,18 +98,32 @@ PushEngine::startup() } -bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +Tick +PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) { - return owner->getAddrRanges(); + panic("recvAtomic unimpl."); +} + +void +PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); } -bool PushEngine::handleUpdate(PacketPtr pkt) +bool +PushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -131,7 +145,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); - uint8_t *data = updatePkt->getPtr(); + uint8_t* data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 0acedd0da8..1aa70c7acb 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -58,8 +58,12 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class PushReqPort : public RequestPort @@ -76,6 +80,8 @@ class PushEngine : public ClockedObject {} void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -95,6 +101,8 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -138,6 +146,8 @@ class PushEngine : public ClockedObject AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); @@ -148,4 +158,5 @@ class PushEngine : public ClockedObject }; } + #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 9b16a15575..bfabed33e9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -76,12 +76,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - void 
WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { @@ -162,7 +156,8 @@ WLEngine::recvFunctional(PacketPtr pkt) memPort.recvFunctional(pkt); } -bool WLEngine::handleWLUpdate(PacketPtr pkt){ +bool +WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8d02c16981..ad53fd7e7e 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,8 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges() const override; - void trySendRetry(); + virtual AddrRangeList getAddrRanges(); protected: virtual bool recvTimingReq(PacketPtr pkt); From 615a4a906fb163fd6b2a34afc516fb0dc519d92f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:36:23 -0800 Subject: [PATCH 025/279] Bug fix. --- src/accl/push_engine.cc | 24 ++++++++++++++++++------ src/accl/push_engine.hh | 13 +++++-------- src/accl/wl_engine.cc | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 746ed8a142..bf385818f5 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -122,6 +122,24 @@ PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) owner->recvFunctional(pkt); } +void +PushEngine::PushRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +PushEngine::recvFunctional(PacketPtr pkt) +{ + memPort.sendFunctional(pkt); +} + bool PushEngine::handleUpdate(PacketPtr pkt) { @@ -293,12 +311,6 @@ PushEngine::PushReqPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - void PushEngine::PushMemPort::recvReqRetry() { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 
1aa70c7acb..269170c045 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -131,23 +131,20 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + + bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); - bool handleUpdate(PacketPtr pkt); - - bool handleMemResp(PacketPtr pkt); - - AddrRangeList getAddrRanges(); - - void recvFunctional(PacketPtr pkt); - public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index bfabed33e9..8365e754fc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -153,7 +153,7 @@ WLEngine::getAddrRanges() const void WLEngine::recvFunctional(PacketPtr pkt) { - memPort.recvFunctional(pkt); + memPort.sendFunctional(pkt); } bool From 927bc4f8925a4198594238921770150a49078137 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:50:05 -0800 Subject: [PATCH 026/279] Bug fixes. 
--- src/accl/wl_engine.cc | 16 +++++++++++----- src/accl/wl_engine.hh | 6 +++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 8365e754fc..872f38673e 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -41,11 +41,11 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) + nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), + updateQueue(queueSize), + responseQueue(queueSize) { - updateQueue.resize(queueSize); - responseQueue.resize(queueSize); } Port & @@ -88,6 +88,12 @@ WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) panic("recvAtomic unimpl."); } +void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + void WLEngine::WLRespPort::recvRespRetry() { @@ -256,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ad53fd7e7e..fe26d22aef 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -172,14 +172,14 @@ class WLEngine : public ClockedObject WLQueue updateQueue; WLQueue responseQueue; - - - public: AddrRangeList getAddrRanges() const; bool handleWLUpdate(PacketPtr pkt); bool handleMemResp(PacketPtr resp); void recvFunctional(PacketPtr pkt); + + public: WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From 12fe98910d1eccc7f65f7d9f4eb25b7d7471c03e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:56:40 -0800 Subject: [PATCH 027/279] 
Bug fix. --- src/accl/push_engine.hh | 5 +---- src/accl/wl_engine.cc | 2 +- src/accl/wl_engine.hh | 28 +++++++++++++--------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 269170c045..ea9026ff8f 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -49,13 +49,10 @@ class PushEngine : public ClockedObject { private: PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; public: PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges(); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 872f38673e..98c940a2de 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -262,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); + respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index fe26d22aef..94ac7c7aff 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -82,12 +82,11 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - PacketPtr blockedPacket; public: - WLRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + WLRespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} virtual AddrRangeList getAddrRanges(); protected: @@ -105,12 +104,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLReqPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { 
return _blocked; } protected: void recvReqRetry() override; @@ -125,13 +124,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLMemPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: virtual bool recvTimingResp(PacketPtr pkt); From c581b5f0a44818898571e7a1cf5078e54ebd56f7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 13:44:39 -0800 Subject: [PATCH 028/279] Bug fix. --- src/accl/apply.cc | 12 ------------ src/accl/apply.hh | 34 +++++++++++++++------------------- src/accl/push_engine.cc | 2 +- src/accl/push_engine.hh | 2 +- src/accl/wl_engine.hh | 2 +- 5 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 410eff5268..b493d3d152 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -75,12 +75,6 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - void Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) { @@ -116,12 +110,6 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } } -void -Apply::ApplyMemPort::trySendRetry() -{ - sendRetryResp(); -} - void Apply::ApplyMemPort::recvReqRetry() { diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f08c1fef85..6ab639c552 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -64,7 +64,7 @@ class Apply : public ClockedObject } PacketPtr pop(){ - return applyQueue->pop(); + return applyQueue.pop(); } PacketPtr front(){ @@ -79,16 +79,12 @@ class Apply : public ClockedObject { private: Apply *owner; - bool _blocked; - PacketPtr blockedPacket; public: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); 
ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); @@ -140,16 +136,24 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + System* const system; + const RequestorID requestorId; + ApplyMemPort memPort; ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; + + std::unordered_map requestOffset; + bool handleWL(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readApplyBuffer(); + // bool sendPacket(); + // //one queue for write and one for read a priotizes write over read + // void readApplyBuffer(); bool handleMemResp(PacketPtr resp); - void writePushBuffer(); + // void writePushBuffer(); //Events void processNextApplyCheckEvent(); @@ -166,16 +170,8 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - AddrRangeList getAddrRanges() const; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; - - std::unordered_map requestOffset; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bf385818f5..fde79a5aa7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -99,7 +99,7 @@ PushEngine::startup() } AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +PushEngine::PushRespPort::getAddrRanges() const { return owner->getAddrRanges(); } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index ea9026ff8f..fbb7d6915a 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,7 +54,7 @@ class PushEngine : public ClockedObject PushRespPort(const std::string& name, PushEngine* owner): ResponsePort(name, owner), owner(owner) {} - 
virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 94ac7c7aff..504b63bc46 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -87,7 +87,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, WLEngine* owner): ResponsePort(name, owner), owner(owner) {} - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); From a6c9f3f0ef5c8158aecf1649dfd7e3197076745a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 17:02:08 -0800 Subject: [PATCH 029/279] Apply engine compiles --- src/accl/apply.cc | 33 +++++++++++++++++++++++++++------ src/accl/apply.hh | 45 ++++++++++++++++++++++----------------------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b493d3d152..55288693f3 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -41,10 +41,12 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + applyReadQueue(params.applyQueueSize), + applyWriteQueue(params.applyQueueSize) { - applyReadQueue(params.applyQueueSize); - applyWriteQueue(params.applyQueueSize); + // applyReadQueue(params.applyQueueSize); + // applyWriteQueue(params.applyQueueSize); } Port & @@ -62,7 +64,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() +Apply::ApplyRespPort::getAddrRanges() const { return owner->getAddrRanges(); } @@ -93,6 +95,12 @@ Apply::ApplyRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } 
+void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { @@ -118,6 +126,12 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryResp(); +} + void Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { @@ -135,6 +149,12 @@ Apply::ApplyReqPort::recvReqRetry() blockedPacket = nullptr; } +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvRespRetry from response port is called."); +} + AddrRangeList Apply::getAddrRanges() const { @@ -158,7 +178,8 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ - auto pkt = queue.pop(); + PacketPtr pkt = queue.front(); + queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ respPort.trySendRetry(); queue.sendPktRetry = false; @@ -229,7 +250,7 @@ Apply::processNextApplyEvent(){ } } }else{ - queue.pop(); + queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); queue.sendPktRetry = false; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 6ab639c552..7f17e173c6 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,8 +63,8 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - PacketPtr pop(){ - return applyQueue.pop(); + void pop(){ + applyQueue.pop(); } PacketPtr front(){ @@ -72,20 +72,20 @@ class Apply : public ClockedObject } ApplyQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class ApplyRespPort : public ResponsePort { private: Apply *owner; - public: ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges() const; - + void trySendRetry(); protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); @@ -105,7 +105,6 @@ class Apply : public ClockedObject RequestPort(name, owner), 
owner(owner), _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } @@ -139,9 +138,24 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyMemPort memPort; - ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyRespPort respPort; + ApplyMemPort memPort; + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -155,21 +169,6 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); - //Events - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - AddrRangeList getAddrRanges() const; public: From 6ef32d48e9248b792dd413d41fc1fad7976d4585 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:22:29 -0800 Subject: [PATCH 030/279] Bug fix. Very close to first compilation. 
--- src/accl/apply.cc | 60 +++++++------------ src/accl/apply.hh | 87 ++++++++++++++------------- src/accl/push_engine.cc | 126 ++++++++++++++++++++++------------------ src/accl/util.hh | 14 +++++ src/accl/wl_engine.cc | 22 ++----- src/accl/wl_engine.hh | 3 +- 6 files changed, 153 insertions(+), 159 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 55288693f3..9c3d3f1c3d 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -37,17 +39,14 @@ Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize) -{ - // applyReadQueue(params.applyQueueSize); - // applyWriteQueue(params.applyQueueSize); -} + applyWriteQueue(params.applyQueueSize), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) +{} Port & Apply::getPort(const std::string &if_name, PortID idx) @@ -96,22 +95,8 @@ Apply::ApplyRespPort::recvRespRetry() } void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - -bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; @@ -119,30 +104,27 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyMemPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = 
false; sendPacket(blockedPacket); blockedPacket = nullptr; } -void -Apply::ApplyMemPort::trySendRetry() +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) { - sendRetryResp(); + panic("recvTimingResp called on reqPort."); } -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) +bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } void -Apply::ApplyReqPort::recvReqRetry() +Apply::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -179,9 +161,8 @@ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); - queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - respPort.trySendRetry(); + // respPort.trySendRetry(); queue.sendPktRetry = false; } // conver to ReadReq @@ -190,7 +171,8 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + memPort.sendPacke:(memPkt); + queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -245,14 +227,14 @@ Apply::processNextApplyEvent(){ reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } }else{ queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 7f17e173c6..2a16632e22 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -49,31 +48,31 @@ class Apply : public 
ClockedObject private: struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} + std::queue applyQueue; + const uint32_t queueSize; + bool sendPktRetry; + + bool blocked(){ + return (applyQueue.size() == queueSize); + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + void pop(){ + applyQueue.pop(); + } + + PacketPtr front(){ + return applyQueue.front(); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize) + {} }; class ApplyRespPort : public ResponsePort @@ -109,8 +108,8 @@ class Apply : public ClockedObject bool blocked() { return _blocked; } protected: - void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; class ApplyMemPort : public RequestPort @@ -127,7 +126,7 @@ class Apply : public ClockedObject {} void sendPacket(PacketPtr pkt); - void trySendRetry(); + // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -138,25 +137,10 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyReqPort reqPort; ApplyRespPort respPort; + ApplyReqPort reqPort; ApplyMemPort memPort; - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -169,6 +153,21 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); + //Events + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + AddrRangeList getAddrRanges() const; public: diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index fde79a5aa7..125433653b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,9 +26,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" #include "accl/push_engine.hh" +#include "accl/util.hh" + namespace gem5 { @@ -128,6 +129,68 @@ PushEngine::PushRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { @@ -224,24 +287,8 @@ void PushEngine::processNextReadEvent() } } -bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool PushEngine::handleMemResp(PacketPtr pkt) +bool +PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -259,7 +306,8 @@ bool PushEngine::handleMemResp(PacketPtr pkt) // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); updateQueue.push(update); } @@ -286,42 +334,4 @@ void PushEngine::processNextSendEvent() } } -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -PushEngine::PushMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - } diff --git a/src/accl/util.hh b/src/accl/util.hh index 91692488a4..b3cff93f15 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" #include "mem/request.hh" @@ -39,12 +40,25 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; + + std::string to_string() + { + return csprintf( + "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + temp_prop, prop, degree, edgeIndex); + } + }; struct Edge { uint64_t weight; Addr neighbor; + + std::string to_string() + { + return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); + } }; WorkListItem memoryToWorkList(uint8_t* data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 98c940a2de..eb883cb19b 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -76,12 +78,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { @@ -125,12 +121,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} - void WLEngine::WLReqPort::recvReqRetry() { @@ -244,12 +234,12 @@ WLEngine::processNextWLReduceEvent(){ applyPort.sendPacket(writePkt); queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } } @@ -257,12 +247,12 @@ WLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = 
false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 504b63bc46..ee25154caa 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -140,8 +139,8 @@ class WLEngine : public ClockedObject const uint32_t queueSize; const RequestorID requestorId; - WLReqPort reqPort; WLRespPort respPort; + WLReqPort reqPort; WLMemPort memPort; bool handleWLU(PacketPtr pkt); From 14ca405b83b04ee39c6533f72e081587b34b162f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:46:20 -0800 Subject: [PATCH 031/279] More bug fixes. --- src/accl/apply.cc | 8 +------- src/accl/util.hh | 3 ++- src/accl/wl_engine.cc | 12 +++++------- src/accl/wl_engine.hh | 18 ++++++------------ 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 9c3d3f1c3d..b18c990da2 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -131,12 +131,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvRespRetry from response port is called."); -} - AddrRangeList Apply::getAddrRanges() const { @@ -171,7 +165,7 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacke:(memPkt); + memPort.sendPacket(memPkt); queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ diff --git a/src/accl/util.hh b/src/accl/util.hh index b3cff93f15..a4418a1cb8 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -71,6 +71,7 @@ PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned 
int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId); } diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index eb883cb19b..614f34d175 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -38,17 +38,15 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), - queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), + updateQueue(params.wlQueueSize), + responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), - updateQueue(queueSize), - responseQueue(queueSize) -{ -} + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) +{} Port & WLEngine::getPort(const std::string &if_name, PortID idx) diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ee25154caa..57cc063880 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -136,26 +136,26 @@ class WLEngine : public ClockedObject }; System* const system; - const uint32_t queueSize; const RequestorID requestorId; WLRespPort respPort; WLReqPort reqPort; WLMemPort memPort; - bool handleWLU(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readWLBuffer(); + WLQueue updateQueue; + WLQueue responseQueue; + std::unordered_map requestOffset; //Events + bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by 
MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -164,14 +164,8 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - std::unordered_map requestOffset; - - WLQueue updateQueue; - WLQueue responseQueue; - AddrRangeList getAddrRanges() const; - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); public: From df282ddc4a0f32d8a9592c000500fcabd19f2893 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:53:21 -0800 Subject: [PATCH 032/279] Compilation. yeay. --- src/accl/apply.cc | 9 +++++++++ src/accl/wl_engine.cc | 40 +++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b18c990da2..40002c5264 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -117,6 +117,15 @@ Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) panic("recvTimingResp called on reqPort."); } +void +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 614f34d175..d2ecd0d7c9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -94,17 +94,14 @@ WLEngine::WLRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLReqPort::recvTimingResp(PacketPtr) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + panic("recvTimingResp called on the request port."); } void -WLEngine::WLMemPort::recvReqRetry() +WLEngine::WLReqPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -113,14 +110,26 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) { - return owner->handleMemResp(pkt); + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } } void -WLEngine::WLReqPort::recvReqRetry() +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. assert(_blocked && blockedPacket != nullptr); @@ -129,13 +138,10 @@ WLEngine::WLReqPort::recvReqRetry() blockedPacket = nullptr; } -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } AddrRangeList From ed2254773d278ec81d63066c897de89e60e73b2b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Feb 2022 10:31:28 -0800 Subject: [PATCH 033/279] Fixing a typo. --- src/accl/PushEngine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 3215fdaee2..840d8dea1f 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -30,7 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'WLEngine' + type = 'PushEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' From 1a87fc9f02b9d15957c8b35573d83f7669fa3687 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 18 Feb 2022 14:08:41 -0800 Subject: [PATCH 034/279] Restructuring the directory. 
--- src/accl/{ => graph/base}/Apply.py | 0 src/accl/{ => graph/base}/PushEngine.py | 0 src/accl/{ => graph/base}/SConscript | 0 src/accl/{ => graph/base}/WLEngine.py | 0 src/accl/{ => graph/base}/apply.cc | 73 +-------------------- src/accl/{ => graph/base}/apply.hh | 44 +------------ src/accl/{ => graph/base}/push_engine.cc | 0 src/accl/{ => graph/base}/push_engine.hh | 0 src/accl/{ => graph/base}/util.cc | 0 src/accl/{ => graph/base}/util.hh | 0 src/accl/{ => graph/base}/wl_engine.cc | 83 +----------------------- src/accl/{ => graph/base}/wl_engine.hh | 49 +------------- src/accl/graph/sega/mpu.hh | 0 13 files changed, 7 insertions(+), 242 deletions(-) rename src/accl/{ => graph/base}/Apply.py (100%) rename src/accl/{ => graph/base}/PushEngine.py (100%) rename src/accl/{ => graph/base}/SConscript (100%) rename src/accl/{ => graph/base}/WLEngine.py (100%) rename src/accl/{ => graph/base}/apply.cc (80%) rename src/accl/{ => graph/base}/apply.hh (78%) rename src/accl/{ => graph/base}/push_engine.cc (100%) rename src/accl/{ => graph/base}/push_engine.hh (100%) rename src/accl/{ => graph/base}/util.cc (100%) rename src/accl/{ => graph/base}/util.hh (100%) rename src/accl/{ => graph/base}/wl_engine.cc (79%) rename src/accl/{ => graph/base}/wl_engine.hh (75%) create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/src/accl/Apply.py b/src/accl/graph/base/Apply.py similarity index 100% rename from src/accl/Apply.py rename to src/accl/graph/base/Apply.py diff --git a/src/accl/PushEngine.py b/src/accl/graph/base/PushEngine.py similarity index 100% rename from src/accl/PushEngine.py rename to src/accl/graph/base/PushEngine.py diff --git a/src/accl/SConscript b/src/accl/graph/base/SConscript similarity index 100% rename from src/accl/SConscript rename to src/accl/graph/base/SConscript diff --git a/src/accl/WLEngine.py b/src/accl/graph/base/WLEngine.py similarity index 100% rename from src/accl/WLEngine.py rename to src/accl/graph/base/WLEngine.py diff --git 
a/src/accl/apply.cc b/src/accl/graph/base/apply.cc similarity index 80% rename from src/accl/apply.cc rename to src/accl/graph/base/apply.cc index 40002c5264..eae9c2fd16 100644 --- a/src/accl/apply.cc +++ b/src/accl/graph/base/apply.cc @@ -30,17 +30,13 @@ #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -51,72 +47,13 @@ Apply::Apply(const ApplyParams ¶ms): Port & Apply::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWL(pkt)){ - return false; - } - return true; -} - -void -Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) -{ - panic("Not implemented"); -} - -Tick -Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -Apply::ApplyRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -Apply::ApplyReqPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on reqPort."); -} - void Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { @@ -140,12 
+77,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -AddrRangeList -Apply::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ diff --git a/src/accl/apply.hh b/src/accl/graph/base/apply.hh similarity index 78% rename from src/accl/apply.hh rename to src/accl/graph/base/apply.hh index 2a16632e22..a3f0ff5aa3 100644 --- a/src/accl/apply.hh +++ b/src/accl/graph/base/apply.hh @@ -46,7 +46,7 @@ namespace gem5 class Apply : public ClockedObject { private: - + //FIXME: Remove queue defenition from here. struct ApplyQueue{ std::queue applyQueue; const uint32_t queueSize; @@ -75,43 +75,6 @@ class Apply : public ClockedObject {} }; - class ApplyRespPort : public ResponsePort - { - private: - Apply *owner; - public: - ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - void trySendRetry(); - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class ApplyReqPort : public RequestPort - { - private: - Apply *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ApplyReqPort(const std::string& name, Apply* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - class ApplyMemPort : public RequestPort { private: @@ -134,11 +97,8 @@ class Apply : public ClockedObject void recvReqRetry() override; }; - System* const system; const RequestorID requestorId; - ApplyRespPort respPort; - ApplyReqPort reqPort; ApplyMemPort memPort; ApplyQueue applyReadQueue; @@ -168,8 +128,6 @@ class Apply : public ClockedObject Write edgelist 
loc in buffer */ - AddrRangeList getAddrRanges() const; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/graph/base/push_engine.cc similarity index 100% rename from src/accl/push_engine.cc rename to src/accl/graph/base/push_engine.cc diff --git a/src/accl/push_engine.hh b/src/accl/graph/base/push_engine.hh similarity index 100% rename from src/accl/push_engine.hh rename to src/accl/graph/base/push_engine.hh diff --git a/src/accl/util.cc b/src/accl/graph/base/util.cc similarity index 100% rename from src/accl/util.cc rename to src/accl/graph/base/util.cc diff --git a/src/accl/util.hh b/src/accl/graph/base/util.hh similarity index 100% rename from src/accl/util.hh rename to src/accl/graph/base/util.hh diff --git a/src/accl/wl_engine.cc b/src/accl/graph/base/wl_engine.cc similarity index 79% rename from src/accl/wl_engine.cc rename to src/accl/graph/base/wl_engine.cc index d2ecd0d7c9..dc8f1dd744 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/graph/base/wl_engine.cc @@ -26,21 +26,17 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/wl_engine.hh" +#include "accl/graph/base/wl_engine.hh" #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -51,74 +47,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): Port & WLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -WLEngine::WLRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWLUpdate(pkt)){ - return false; - } - return true; -} - -Tick -WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::WLRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -bool -WLEngine::WLReqPort::recvTimingResp(PacketPtr) -{ - panic("recvTimingResp called on the request port."); -} - -void -WLEngine::WLReqPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -144,18 +79,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -AddrRangeList -WLEngine::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; diff --git a/src/accl/wl_engine.hh b/src/accl/graph/base/wl_engine.hh similarity index 75% rename from src/accl/wl_engine.hh rename to src/accl/graph/base/wl_engine.hh index 57cc063880..3654999b70 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/graph/base/wl_engine.hh @@ -46,7 +46,7 @@ namespace gem5 class WLEngine : public ClockedObject { private: - + //FIXME: Change this struct WLQueue{ std::queue wlQueue; uint32_t queueSize; @@ -77,44 +77,6 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLRespPort : public ResponsePort //From Push engine - { - private: - WLEngine *owner; - - public: - WLRespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class WLReqPort : public RequestPort //To Apply Engine - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - WLReqPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - 
- protected: - void recvReqRetry() override; - virtual bool recvTimingResp(PacketPtr pkt); - }; - class WLMemPort : public RequestPort { private: @@ -135,11 +97,6 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - System* const system; - const RequestorID requestorId; - - WLRespPort respPort; - WLReqPort reqPort; WLMemPort memPort; WLQueue updateQueue; @@ -164,10 +121,6 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - AddrRangeList getAddrRanges() const; - - void recvFunctional(PacketPtr pkt); - public: WLEngine(const WLEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..e69de29bb2 From 254dd925ea2ef471257c0553fb1ce7eeca1be32b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 09:59:09 -0800 Subject: [PATCH 035/279] Restructing the classes. --- src/accl/graph/base/Apply.py | 5 +--- src/accl/graph/base/PushEngine.py | 3 -- src/accl/graph/base/WLEngine.py | 5 +--- src/accl/graph/sega/MPU.py | 46 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 11 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/Apply.py index 8720287cc8..80aa430139 100644 --- a/src/accl/graph/base/Apply.py +++ b/src/accl/graph/base/Apply.py @@ -34,8 +34,5 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' - system = Param.System(Parent.any, "The system object this apply engine is a part of") - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") applyQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/base/PushEngine.py index 840d8dea1f..7fef165169 100644 --- a/src/accl/graph/base/PushEngine.py 
+++ b/src/accl/graph/base/PushEngine.py @@ -34,7 +34,4 @@ class PushEngine(ClockedObject): cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - system = Param.System(Parent.any, "The system object this push engine is a part of") - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/base/WLEngine.py index 562fd04423..deaee20935 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/base/WLEngine.py @@ -34,8 +34,5 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' - system = Param.System(Parent.any, "The system object this push WorkList is a part of") - respPort = ResponsePort("Receives updates") - reqPort = RequestPort("Sends requests to Apply") - memPort = RequestPort("Memory side port, sends requests") wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..b6e136dda5 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine + +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + workListEngine = Param.WLEngine("WLEngine object to connect to " + "This MPU") + applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + "This MPU") + pushEngine = Param.PushEngine("PushEngine object to connect to " + "This MPU") From 2c27ec07f402c1c27f2bc8431d033ffe1d8d852e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:12:50 -0800 Subject: [PATCH 036/279] Sperating WLEngine and BaseWLEngine + few changes in BaseApplyEngine --- .../base/{Apply.py => BaseApplyEngine.py} | 0 src/accl/graph/base/BaseWLEngine.py | 38 ++++++++++++++++++ .../base/{apply.cc => base_apply_engine.cc} | 20 +++++----- .../base/{apply.hh => base_apply_engine.hh} | 35 +++++----------- .../base/{wl_engine.cc => base_wl_engine.cc} | 20 +++++----- .../base/{wl_engine.hh => base_wl_engine.hh} | 13 +++--- src/accl/graph/sega/ApplyEngine.py | 40 +++++++++++++++++++ src/accl/graph/{base => sega}/WLEngine.py | 12 +++--- src/accl/graph/sega/apply_engine.cc | 0 src/accl/graph/sega/apply_engine.hh | 0 src/accl/graph/sega/wl_engine.cc | 0 src/accl/graph/sega/wl_engine.hh | 0 12 files changed, 120 insertions(+), 58 deletions(-) rename src/accl/graph/base/{Apply.py => BaseApplyEngine.py} (100%) create mode 100644 src/accl/graph/base/BaseWLEngine.py rename src/accl/graph/base/{apply.cc => base_apply_engine.cc} (91%) rename src/accl/graph/base/{apply.hh => base_apply_engine.hh} (79%) rename src/accl/graph/base/{wl_engine.cc => base_wl_engine.cc} (91%) rename src/accl/graph/base/{wl_engine.hh => base_wl_engine.hh} (93%) create mode 100644 src/accl/graph/sega/ApplyEngine.py rename 
src/accl/graph/{base => sega}/WLEngine.py (84%) create mode 100644 src/accl/graph/sega/apply_engine.cc create mode 100644 src/accl/graph/sega/apply_engine.hh create mode 100644 src/accl/graph/sega/wl_engine.cc create mode 100644 src/accl/graph/sega/wl_engine.hh diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/Apply.py rename to src/accl/graph/base/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py new file mode 100644 index 0000000000..7384e876ef --- /dev/null +++ b/src/accl/graph/base/BaseWLEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseWLEngine(ClockedObject): + type = 'BaseWLEngine' + cxx_header = "accl/base_wl_engine.hh" + cxx_class = 'gem5::BaseWLEngine' + + wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/apply.cc b/src/accl/graph/base/base_apply_engine.cc similarity index 91% rename from src/accl/graph/base/apply.cc rename to src/accl/graph/base/base_apply_engine.cc index eae9c2fd16..c88d14a2c2 100644 --- a/src/accl/graph/base/apply.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.hh" +#include "accl/base_apply_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -Apply::Apply(const ApplyParams ¶ms): +BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), @@ -45,7 +45,7 @@ Apply::Apply(const ApplyParams ¶ms): {} Port & -Apply::getPort(const std::string &if_name, PortID idx) +BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,20 +64,20 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -Apply::ApplyMemPort::recvReqRetry() +BaseApplyEngine::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); blockedPacket = nullptr; } -bool Apply::handleWL(PacketPtr pkt){ +bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -91,7 +91,7 @@ bool Apply::handleWL(PacketPtr pkt){ return true; } -void Apply::processNextApplyCheckEvent(){ +void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); @@ -114,7 +114,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacketPtr pkt) +BaseApplyEngine::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; @@ -132,7 +132,7 @@ Apply::handleMemResp(PacketPtr pkt) } void -Apply::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent(){ auto queue = applyWriteQueue; PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); diff --git a/src/accl/graph/base/apply.hh b/src/accl/graph/base/base_apply_engine.hh similarity index 79% rename from src/accl/graph/base/apply.hh rename to src/accl/graph/base/base_apply_engine.hh index a3f0ff5aa3..c2d2f26387 100644 --- a/src/accl/graph/base/apply.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,18 +32,16 @@ #include #include -#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/Apply.hh" +#include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { -class Apply : public ClockedObject +class BaseApplyEngine : public ClockedObject { private: //FIXME: Remove queue defenition from here. @@ -75,21 +73,20 @@ class Apply : public ClockedObject {} }; - class ApplyMemPort : public RequestPort + class MemPort : public RequestPort { private: - Apply *owner; + BaseApplyEngine *owner; bool _blocked; PacketPtr blockedPacket; public: - ApplyMemPort(const std::string& name, Apply* owner): + MemPort(const std::string& name, BaseApplyEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} void sendPacket(PacketPtr pkt); - // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -99,7 +96,7 @@ class Apply : public ClockedObject const RequestorID requestorId; - ApplyMemPort memPort; + MemPort memPort; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -107,29 +104,15 @@ class Apply : public ClockedObject std::unordered_map requestOffset; bool handleWL(PacketPtr pkt); - // bool sendPacket(); - // //one queue for write and one for read a priotizes write over read - // void readApplyBuffer(); - bool handleMemResp(PacketPtr resp); - // void writePushBuffer(); - - //Events EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ + + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ public: - Apply(const ApplyParams &apply); + BaseApplyEngine(const ApplyParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc similarity index 91% rename from src/accl/graph/base/wl_engine.cc rename to src/accl/graph/base/base_wl_engine.cc index dc8f1dd744..7261069c17 100644 --- a/src/accl/graph/base/wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/graph/base/wl_engine.hh" +#include "accl/graph/base/base_wl_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), @@ -45,7 +45,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): {} Port & -WLEngine::getPort(const std::string &if_name, PortID idx) +BaseWLEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,7 +64,7 @@ WLEngine::WLMemPort::sendPacket(PacketPtr pkt) } void -WLEngine::WLMemPort::recvReqRetry() +BaseWLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -74,13 +74,13 @@ WLEngine::WLMemPort::recvReqRetry() } bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } bool -WLEngine::handleWLUpdate(PacketPtr pkt){ +BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -94,7 +94,7 @@ WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } -void WLEngine::processNextWLReadEvent(){ +void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; while (!queue.empty()){ //create a map instead of front PacketPtr pkt = queue.front(); @@ -117,7 +117,7 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacketPtr pkt) +BaseWLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ @@ -134,7 +134,7 @@ WLEngine::handleMemResp(PacketPtr pkt) } void -WLEngine::processNextWLReduceEvent(){ +BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; auto applyPort = reqPort; diff --git a/src/accl/graph/base/wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh similarity index 93% rename from src/accl/graph/base/wl_engine.hh rename to src/accl/graph/base/base_wl_engine.hh index 3654999b70..2095a20f1b 100644 --- a/src/accl/graph/base/wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/WLEngine.hh" +#include "params/BaseWLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" @@ -43,7 +43,7 @@ namespace gem5 { -class WLEngine : public ClockedObject +class BaseWLEngine : public ClockedObject { private: //FIXME: Change this @@ -77,7 +77,7 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLMemPort : public RequestPort + class MemPort : public RequestPort 
{ private: WLEngine *owner; @@ -85,7 +85,7 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, WLEngine* owner): + MemPort(const std::string& name, WLEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -97,8 +97,7 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - WLMemPort memPort; - + MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -122,7 +121,7 @@ class WLEngine : public ClockedObject */ public: - WLEngine(const WLEngineParams ¶ms); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py new file mode 100644 index 0000000000..0d03e71e54 --- /dev/null +++ b/src/accl/graph/sega/ApplyEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from build.NULL.python.m5.proxy import Parent +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseApplyEngine import BaseApplyEngine + +class ApplyEngine(BaseApplyEngine): + type = 'ApplyEngine' + cxx_header = "accl/graph/sega/apply_engine.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/sega/WLEngine.py similarity index 84% rename from src/accl/graph/base/WLEngine.py rename to src/accl/graph/sega/WLEngine.py index deaee20935..a8f3bd20ea 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,14 +25,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseWLEngine import BaseWLEngine -class WLEngine(ClockedObject): +class WLEngine(BaseWLEngine): type = 'WLEngine' - cxx_header = "accl/wl_engine.hh" - cxx_class = 'gem5::WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::MPU' - wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From 73a55cd8781e57bea1b40b150b9274d25f62a6d3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 11:25:17 -0800 Subject: [PATCH 037/279] Restructuring classes. 
--- src/accl/graph/base/BasePushEngine.py | 37 ++++++ src/accl/graph/base/SConscript | 4 +- .../{push_engine.cc => base_push_engine.cc} | 125 +++++------------- .../{push_engine.hh => base_push_engine.hh} | 66 ++------- src/accl/graph/sega/MPU.py | 2 +- src/accl/graph/{base => sega}/PushEngine.py | 14 +- src/accl/graph/sega/push_engine.cc | 0 src/accl/graph/sega/push_engine.hh | 0 8 files changed, 90 insertions(+), 158 deletions(-) create mode 100644 src/accl/graph/base/BasePushEngine.py rename src/accl/graph/base/{push_engine.cc => base_push_engine.cc} (77%) rename src/accl/graph/base/{push_engine.hh => base_push_engine.hh} (66%) rename src/accl/graph/{base => sega}/PushEngine.py (83%) create mode 100644 src/accl/graph/sega/push_engine.cc create mode 100644 src/accl/graph/sega/push_engine.hh diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py new file mode 100644 index 0000000000..6ed5d25978 --- /dev/null +++ b/src/accl/graph/base/BasePushEngine.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BasePushEngine(ClockedObject): + type = 'BasePushEngine' + cxx_header = "accl/graph/base/base_push_engine.hh" + cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 18ac71eb7d..a881fa1e6e 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,10 @@ Import('*') SimObject('Apply.py') -SimObject('PushEngine.py') +SimObject('BasePushEngine.py') SimObject('WLEngine.py') Source('apply.cc') -Source('push_engine.cc') +Source('base_push_engine.cc') Source('wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/push_engine.cc b/src/accl/graph/base/base_push_engine.cc similarity index 77% rename from src/accl/graph/base/push_engine.cc rename to src/accl/graph/base/base_push_engine.cc index 125433653b..9fbc89221f 100644 --- a/src/accl/graph/base/push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -26,18 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/push_engine.hh" +#include "accl/graph/base/base_push_engine.hh" -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), - respPort(name() + ".respPort", this), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), @@ -50,21 +47,29 @@ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), } Port & -PushEngine::getPort(const std::string &if_name, PortID idx) +BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } +RequestorID +BasePushEngine::getRequestorId() +{ + return requestorId; +} + +void +BasePushEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void -PushEngine::startup() +BasePushEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. 
@@ -99,75 +104,14 @@ PushEngine::startup() } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleUpdate(pkt); -} - -Tick -PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -PushEngine::PushRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - bool -PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +BasePushEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -179,7 +123,7 @@ PushEngine::PushMemPort::sendPacket(PacketPtr pkt) } void -PushEngine::PushMemPort::recvReqRetry() +BasePushEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -191,20 +135,8 @@ PushEngine::PushMemPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -PushEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool -PushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { @@ -223,7 +155,8 @@ PushEngine::handleUpdate(PacketPtr pkt) return true; } -void PushEngine::processNextReceiveEvent() +void +BasePushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); uint8_t* data = updatePkt->getPtr(); @@ -274,7 +207,8 @@ void PushEngine::processNextReceiveEvent() } } -void PushEngine::processNextReadEvent() +void +BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); if (!memPort.blocked()) { @@ -288,7 +222,7 @@ void PushEngine::processNextReadEvent() } bool -PushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -321,7 +255,8 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -void PushEngine::processNextSendEvent() +void +BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); if (!reqPort.blocked()) { diff --git a/src/accl/graph/base/push_engine.hh b/src/accl/graph/base/base_push_engine.hh similarity index 66% rename from src/accl/graph/base/push_engine.hh rename to src/accl/graph/base/base_push_engine.hh index fbb7d6915a..591f4ab734 100644 --- a/src/accl/graph/base/push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,67 +31,27 @@ #include -#include "base/addr_range.hh" #include 
"mem/port.hh" #include "mem/packet.hh" -#include "params/PushEngine.hh" +#include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" -#include "sim/system.hh" namespace gem5 { -class PushEngine : public ClockedObject +class BasePushEngine : public ClockedObject { private: - class PushRespPort : public ResponsePort + class MemPort : public RequestPort { private: - PushEngine* owner; - - public: - PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class PushReqPort : public RequestPort - { - private: - PushEngine* owner; + BasePushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - PushReqPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class PushMemPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - PushMemPort(const std::string& name, PushEngine* owner): + MemPort(const std::string& name, PushEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -106,13 +66,9 @@ class PushEngine : public ClockedObject virtual void startup() override; - System* const system; - const RequestorID requestorId; + RequestorID requestorId; - PushReqPort reqPort; - PushRespPort respPort; - - PushMemPort memPort; + MemPort memPort; std::queue vertexQueue; // int vertexQueueSize; @@ -128,9 +84,6 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - AddrRangeList 
getAddrRanges(); - void recvFunctional(PacketPtr pkt); - bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -144,11 +97,14 @@ class PushEngine : public ClockedObject public: - PushEngine(const PushEngineParams ¶ms); + BasePushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); + }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index b6e136dda5..923c1a2f38 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,7 +28,7 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files + from m5.objects.WLEngine import WLEngine from m5.objects.PushEngine import PushEngine from m5.objects.ApplyEngine import ApplyEngine diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/sega/PushEngine.py similarity index 83% rename from src/accl/graph/base/PushEngine.py rename to src/accl/graph/sega/PushEngine.py index 7fef165169..fa9d921a26 100644 --- a/src/accl/graph/base/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -29,9 +29,13 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class PushEngine(ClockedObject): - type = 'PushEngine' - cxx_header = "accl/push_engine.hh" - cxx_class = 'gem5::PushEngine' +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine - memPort = RequestPort("Port to communicate with the memory") +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff 
--git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From bb54ba51017c68470ac4a897fb73e4210e48ff6a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:34:28 -0800 Subject: [PATCH 038/279] Adding RequestorID --- src/accl/graph/base/base_apply_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_apply_engine.hh | 3 +++ src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_wl_engine.hh | 4 ++++ 4 files changed, 33 insertions(+) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index c88d14a2c2..111ea16f2e 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -54,6 +55,18 @@ BaseApplyEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseApplyEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseApplyEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index c2d2f26387..3304e58a92 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -116,6 +116,9 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7261069c17..dec37636ba 100644 --- 
a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -54,6 +55,18 @@ BaseWLEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseWLEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseWLEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 2095a20f1b..a63d9b1ef7 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -97,6 +97,7 @@ class BaseWLEngine : public ClockedObject void recvReqRetry() override; }; + RequestorID requestorId; MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -125,6 +126,9 @@ class BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } From 3f666defc9ceb76342d4c5383a8edeca00a3e5cd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 13:01:19 -0800 Subject: [PATCH 039/279] Definining MPU interfaces. 
--- src/accl/graph/base/base_push_engine.cc | 35 +---- src/accl/graph/base/base_push_engine.hh | 24 ---- src/accl/graph/base/base_wl_engine.hh | 1 + src/accl/graph/sega/mpu.cc | 183 ++++++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 134 +++++++++++++++++ src/mem/packet.hh | 3 + 6 files changed, 322 insertions(+), 58 deletions(-) create mode 100644 src/accl/graph/sega/mpu.cc diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 9fbc89221f..c4388cab4b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -49,11 +49,7 @@ BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(par Port & BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -104,36 +100,7 @@ BasePushEngine::startup() } -bool -BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} -void -BasePushEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BasePushEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} bool BasePushEngine::handleUpdate(PacketPtr pkt) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 591f4ab734..2265bb32db 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,34 +42,10 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: - - class MemPort : public RequestPort - { - private: - BasePushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - virtual void startup() override; RequestorID requestorId; - MemPort memPort; - std::queue vertexQueue; // int vertexQueueSize; // int vertexQueueLen; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index a63d9b1ef7..3a683bb6e4 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -105,6 +105,7 @@ class BaseWLEngine : public ClockedObject std::unordered_map requestOffset; //Events + //FIXME: make handleWLUpdate public bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..c45ad78ef9 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 The Regents of the University of 
California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +void +MPU::startup() +{ + if (((int16_t) applyEngine->getRequestorId) == -1) { + applyEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) pushEngine->getRequestorId) == -1) { + pushEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) wlEngine->getRequestorId) == -1) { + wlEngine->setRequestorId(nextRequestorId++); + } +} + +AddrRangeList +MPU::MPURespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +MPU::MPURespPort::recvTimingReq(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +Tick +MPU::MPURespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::MPURespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::MPURespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::MPUReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::MPUReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +MPU::MPUMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +MPU::MPUMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +AddrRangeList +MPU::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +MPU::recvFunctional(PacketPtr pkt) +{ + if (pkt->isUpdateWL()) { + panic("Functional requests should not be made to WL.") + //TODO: Might be a good idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + memPort.recvFuctional(pkt); + } +} + +bool +MPU::handleMemReq(PacketPtr pkt) +{ + return memPort.recvTimingReq(pkt); +} + +void +MPU::handleMemResp(PacketPtr pkt) +{ + //TODO: Implement this; +} + +bool +MPU::recvWLNotif(WorkListItem wl) +{ + return applyEngine->recvWLUpdate(wl); +} + +bool +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); +} + +bool +MPU::recvPushUpdate(PacketPtr pkt) +{ + // TODO: Implement this Mahyar +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index e69de29bb2..bc4ba5d53b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class MPU : public ClockedObject +{ + private: + class MPURespPort : public ResponsePort + { + private: + MPU* owner; + + public: + MPURespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class MPUReqPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MPUReqPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class MPUMemPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + virtual void startup(); + + RequestorID nextRequestorId; + + MPURespPort respPort; + MPUReqPort reqPort; + MPUMemPort memPort; + + ApplyEngine* applyEngine; + PushEngine* pushEngine; + WLEngine* wlEngine; + + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr 
pkt); + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool recvWLNotif(WorkListItem wl); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvPushUpdate(PacketPtr pkt); + + public: + + MPU(const MPUParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +} + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 69686e7835..69078fe8f1 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -179,6 +179,7 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent + UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -268,6 +269,8 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } + bool isUpdateWL() const {return testCmdAttrib(updateWL);} + Command responseCommand() const { From bf317d691f124a86cf66e8c7a67a71b32c2fa5a4 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 15:46:10 -0800 Subject: [PATCH 040/279] Adding changes to ApplyEngine and WLEngine --- src/accl/graph/base/base_apply_engine.hh | 28 ++++-------------------- src/accl/graph/base/base_wl_engine.hh | 26 +++++----------------- 2 files changed, 9 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 3304e58a92..d603cb2713 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -73,31 +73,8 @@ class BaseApplyEngine : public ClockedObject {} }; - class MemPort : public RequestPort - { - private: - BaseApplyEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseApplyEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - 
bool blocked(){ return _blocked;} - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - const RequestorID requestorId; - MemPort memPort; - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -106,11 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + public: BaseApplyEngine(const ApplyParams &apply); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3a683bb6e4..0530c64c72 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -77,26 +77,6 @@ class BaseWLEngine : public ClockedObject sendPktRetry(false){} }; - class MemPort : public RequestPort - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - RequestorID requestorId; MemPort memPort; WLQueue updateQueue; @@ -113,6 +93,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); @@ -121,8 +102,11 @@ class BaseWLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + virtual void sendApplyReq(WorkListItem wl) = 0; - public: + public: 
BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, From f3e720658578d1447eb52657b4e544432e27255b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 22:39:08 -0800 Subject: [PATCH 041/279] Finished restructured for ApplyE and WLE, pre-compiled --- src/accl/graph/base/BaseApplyEngine.py | 9 +-- src/accl/graph/base/SConscript | 8 +- src/accl/graph/base/base_apply_engine.cc | 94 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 ++-- src/accl/graph/base/base_wl_engine.cc | 78 +++++--------------- src/accl/graph/base/base_wl_engine.hh | 17 ++--- src/accl/graph/sega/SConscript | 37 ++++++++++ src/accl/graph/sega/apply_engine.cc | 48 ++++++++++++ src/accl/graph/sega/apply_engine.hh | 54 ++++++++++++++ src/accl/graph/sega/wl_engine.cc | 50 +++++++++++++ src/accl/graph/sega/wl_engine.hh | 57 ++++++++++++++ 11 files changed, 321 insertions(+), 144 deletions(-) create mode 100644 src/accl/graph/sega/SConscript diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 80aa430139..23fdfbb08a 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -29,10 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class Apply(ClockedObject): - type = 'Apply' - cxx_header = "accl/apply.hh" - cxx_class = 'gem5::Apply' +class BaseApplyEngine(ClockedObject): + type = 'BaseApplyEngine' + cxx_header = "accl/base_apply_engine.hh" + cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index a881fa1e6e..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,11 +27,11 @@ Import('*') -SimObject('Apply.py') +SimObject('BaseApplyEngine.py') SimObject('BasePushEngine.py') 
-SimObject('WLEngine.py') +SimObject('BaseWLEngine.py') -Source('apply.cc') +Source('base_apply_engine.cc') Source('base_push_engine.cc') -Source('wl_engine.cc') +Source('base_wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 111ea16f2e..805a7649b7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), @@ -48,11 +47,7 @@ BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): Port & BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { return SimObject::getPort(if_name, idx); - } } RequestorID @@ -67,29 +62,6 @@ BaseApplyEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -BaseApplyEngine::ApplyMemPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ @@ -106,19 +78,19 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - if (queue.sendPktRetry && !queue.blocked()){ - // respPort.trySendRetry(); - queue.sendPktRetry = false; - } - // conver 
to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + // if (!memPort.blocked()){ + PacketPtr pkt = queue.front(); + // if (queue.sendPktRetry && !queue.blocked()){ + // // respPort.trySendRetry(); + // queue.sendPktRetry = false; + // } + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + if (parent.sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -157,26 +129,26 @@ BaseApplyEngine::processNextApplyEvent(){ uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - reqPort.sendPacket(writePkt); + // if (!memPort.blocked() && !reqPort.blocked()){ + //update prop with temp_prop + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.recvApplyNotif(WorkListItem.prop, + WorkListItem.degree, + WorkListItem.edgeIndex)){ queue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + // memPort.trySendRetry(); + // queue.sendPktRetry = false; } }else{ queue.applyQueue.pop(); diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index d603cb2713..27d906f060 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_BASEAPPLY_HH__ +#define __ACCL_BASEAPPLY_HH__ #include #include @@ -83,13 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - //FIXME: make void - bool handleMemResp(PacketPtr resp); + + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: - virtual void sendMemReq(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const ApplyParams &apply); @@ -103,4 +104,4 @@ class BaseApplyEngine : public ClockedObject } -#endif // __ACCL_APPLY_HH__ +#endif // __BASEACCL_APPLY_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index dec37636ba..4af6f5e326 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), 
updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), @@ -48,11 +47,7 @@ BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): Port & BaseWLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -67,31 +62,6 @@ BaseWLEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BaseWLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. - assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - bool BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; @@ -109,20 +79,16 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt){ void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - while (!queue.empty()){ //create a map instead of front - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (!memPort.blocked()){ - queue.pop(); - memPort.sendPacket(memPkt); - break; - } + PacketPtr pkt = queue.front(); + /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = + std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = 
req_offset; + if (parent.sendMemReq()){ + queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -150,7 +116,6 @@ void BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - auto applyPort = reqPort; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); @@ -164,17 +129,16 @@ BaseWLEngine::processNextWLReduceEvent(){ if(*value < temp_prop){ temp_prop = *value; } - if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - applyPort.sendPacket(writePkt); + // if (!memPort.blocked() && !applyPort.blocked()){ + wl.temp_prop = temp_prop; + uint8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); @@ -187,12 +151,10 @@ BaseWLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 0530c64c72..1d0f3e33c1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_WLE_HH__ -#define __ACCL_WLE_HH__ +#ifndef __ACCL_BASEWLENGINE_HH__ +#define __ACCL_BASEWLENGINE_HH__ #include #include @@ -78,23 +78,19 @@ class BaseWLEngine : public ClockedObject }; RequestorID requestorId; - MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; std::unordered_map requestOffset; //Events - //FIXME: make handleWLUpdate public - bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - //FIXME: make void - bool handleMemResp(PacketPtr resp); + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -103,8 +99,8 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual void sendMemReq(PacketPtr pkt) = 0; - virtual void sendApplyReq(WorkListItem wl) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool sendWLNotif(WorkListItem wl) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -114,8 +110,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorId requestorId); + bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_WLE_HH__ +#endif // __ACCL_BASEWLENGINE_HH__ diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..79afe3b7d0 --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('WLEngine.py') + +Source('apply_engine.cc') +Source('mpu.cc') +Source('push_engine.cc') +Source('wl_engine.cc') diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index e69de29bb2..41a568bd27 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/apply_engine.hh" + +namespace gem5{ + +ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): + BaseApplyEngine(params) +{} + +virtual bool +ApplyEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +virtual bool +ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ + mpu->recvApplyNotif(prop, degree, edgeIndex); + +} + +} \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index e69de29bb2..fd2bca008f 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "accl/graph/base/base_apply_engine.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/ApplyEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" + +namespace gem5 +{ + +class ApplyEngine : public BaseApplyEngine +{ + private: + MPU mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); +} diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e69de29bb2..9608d0cbc4 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" + +#include + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params) +{} + +virtual bool +WLEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +// FIXME: handle the case where Apply queue is full +virtual bool +WLEngine::sendWLNotif(WorkListItem wl){ + mpu->recvWLNotif(wl); + return true; +} \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index e69de29bb2..eee6b1f22f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_WLENGINE_HH__ +#define __ACCL_WLENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/WLEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + + +namespace gem5 +{ + +class WLEngine : public BaseWorkListEngine +{ + private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool sendWLNotif(WorkListItem wl); + public: + WLEngine(const WLEngineParams ¶ms); +} \ No newline at end of file From a0736d52575b8d28dd5641827ee86a7de096c642 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 23:31:49 -0800 Subject: [PATCH 042/279] Finished restructure for PushEngine. Pre-compile. 
--- src/accl/graph/base/base_push_engine.cc | 30 +++++-------- src/accl/graph/base/base_push_engine.hh | 19 +++++++- src/accl/graph/sega/mpu.cc | 29 +++++++++++-- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.cc | 58 +++++++++++++++++++++++++ src/accl/graph/sega/push_engine.hh | 55 +++++++++++++++++++++++ 6 files changed, 169 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index c4388cab4b..6871154276 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,7 +33,8 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : + ClockedObject(params), requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), @@ -103,7 +104,8 @@ BasePushEngine::startup() bool -BasePushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::recvApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edge_index) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -115,7 +117,7 @@ BasePushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt); + notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -125,21 +127,15 @@ BasePushEngine::handleUpdate(PacketPtr pkt) void BasePushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.front(); - uint8_t* data = updatePkt->getPtr(); - - // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) - uint32_t edge_index = *((uint32_t *)data); - uint32_t degree = *((uint32_t *)(data + 4)); - uint32_t value = *((uint32_t *)(data + 8)); + ApplyNotif notif = notifQueue.front(); std::vector addr_queue; std::vector offset_queue; std::vector num_edge_queue; - for (uint32_t index = 0; index < degree; index++) { + for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { @@ -164,10 +160,10 @@ BasePushEngine::processNextReceiveEvent() memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = value; + reqValueMap[pkt->req] = notif.prop; } - vertexQueue.pop(); + notifQueue.pop(); if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); @@ -178,8 +174,7 @@ void BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); - if (!memPort.blocked()) { - memPort.sendPacket(pkt); + if (!sendMemReq(pkt)) { memReqQueue.pop(); } @@ -226,8 +221,7 @@ void BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); + if 
(!sendPushUpdate(pkt)) { updateQueue.pop(); } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 2265bb32db..63ad3a6652 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,11 +42,22 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: + + struct ApplyNotif { + uint32_t prop; + uint32_t degree; + uint32_t edgeIndex; + + ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): + prop(prop), degree(degree), edgeIndex(edge_index) + {} + }; + virtual void startup() override; RequestorID requestorId; - std::queue vertexQueue; + std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -60,7 +71,7 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool handleUpdate(PacketPtr pkt); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -71,6 +82,10 @@ class BasePushEngine : public ClockedObject EventFunctionWrapper nextSendEvent; void processNextSendEvent(); + protected: + virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendPushUpdate(PacketPtr pkt) = 0; + public: BasePushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c45ad78ef9..09ab23a835 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -161,7 +161,16 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - //TODO: Implement this; + RequestorID requestorId = pkt->requestorId(); + if (applyEngine->getRequestorId() == requestorId) { + applyEngine->handleMemResp(pkt); + } else if (pushEngine->getRequestorId() == requestorId) { + pushEngine->handleMemResp(pkt); + } else if (wlEngine->getRequestorId() == requestorId) { + wlEngine->handleMemResp(pkt); + } else { + panic("Received a response with an unknown 
requestorId."); + } } bool @@ -173,11 +182,25 @@ MPU::recvWLNotif(WorkListItem wl) bool MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); + return pushEngine->recvApplyUpdate(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { - // TODO: Implement this Mahyar + Addr addr = pkt->getAddr(); + for (auto addr_range: memPort.getAddrRangeList()) { + if (addr_range.contains(addr)) { + if (!memPort.sendPacket(pkt)) { + return false; + } + return true; + } + } + + if (!reqPort.sendPacket(pkt)) { + return false; + } + return true; + } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index bc4ba5d53b..93d1dd8bb3 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -120,7 +120,7 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); bool recvPushUpdate(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e69de29bb2..e43512c6f4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngine ¶ms) : + BasePushEngine(params), + owner(params.mpu) +{ +} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + return SimObject::getPort(if_name, idx); +} + +bool +PushEngine::sendMemReq(PacketPtr) +{ + return owner->handleMemReq(pkt); +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + return owner->recvPushUpdate(pkt); +} + +} diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e69de29bb2..54ef72d5f9 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" + +namespace gem5 +{ +class PushEngine : public BasePushEngine +{ + private: + MPU* owner; + + protected: + virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt); + + public: + PushEngine(const PushEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +} + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file From d700e452fa4963ee307185eed8fdcc80f453ec4c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 23:51:02 -0800 Subject: [PATCH 043/279] Debugging. 
--- src/accl/graph/base/base_apply_engine.cc | 31 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 +++++----- src/accl/graph/base/base_push_engine.hh | 9 ++++--- src/accl/graph/base/base_wl_engine.cc | 6 ++--- src/accl/graph/base/base_wl_engine.hh | 9 ++++--- src/accl/graph/base/util.cc | 2 +- src/accl/graph/sega/ApplyEngine.py | 7 ++---- src/accl/graph/sega/MPU.py | 6 ++--- src/accl/graph/sega/PushEngine.py | 16 +++++------- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/WLEngine.py | 7 ++---- src/accl/graph/sega/apply_engine.cc | 6 ++--- src/accl/graph/sega/apply_engine.hh | 10 +++++--- src/accl/graph/sega/mpu.cc | 11 ++++++--- src/accl/graph/sega/mpu.hh | 5 ++++ src/accl/graph/sega/wl_engine.hh | 9 ++++--- 16 files changed, 75 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 805a7649b7..301f5931bf 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base_apply_engine.hh" +#include "accl/graph/base/base_apply_engine.hh" #include @@ -90,7 +90,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq(memPkt)){ + if (sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -98,22 +98,13 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ } } -bool +void BaseApplyEngine::handleMemResp(PacketPtr pkt) { - auto queue = applyWriteQueue; - - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); - - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; - return true; + // FIXME: change the event, remove the retry parts + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } } void @@ -142,10 +133,10 @@ BaseApplyEngine::processNextApplyEvent(){ //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.recvApplyNotif(WorkListItem.prop, - WorkListItem.degree, - WorkListItem.edgeIndex)){ + if (sendMemReq(writePkt) && + recvApplyNotif(wl.prop, + wl.degree, + wl.edgeIndex)){ queue.pop(); // memPort.trySendRetry(); // queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 27d906f060..56b43cfb7b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,14 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEAPPLY_HH__ -#define __ACCL_BASEAPPLY_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ #include #include #include "mem/packet.hh" #include "mem/port.hh" +#include "mem/request.hh" #include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" @@ -73,7 +74,7 @@ class BaseApplyEngine : public ClockedObject {} }; - const RequestorID requestorId; + RequestorID requestorId; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -93,15 +94,15 @@ class BaseApplyEngine : public ClockedObject virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: - BaseApplyEngine(const ApplyParams &apply); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __BASEACCL_APPLY_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 63ad3a6652..873cb26b3d 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_PUSH_ENGINE_HH__ -#define __ACCL_PUSH_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ #include #include "mem/port.hh" +#include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" @@ -94,10 +95,10 @@ class BasePushEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __ACCL_PUSH_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 4af6f5e326..b863b38e19 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -87,7 +87,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq()){ + if (sendMemReq()){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -135,8 +135,8 @@ BaseWLEngine::processNextWLReduceEvent(){ memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.sendWLNotif(writePkt)) { + if (sendMemReq(writePkt) && + sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 1d0f3e33c1..3d807d8b06 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEWLENGINE_HH__ -#define __ACCL_BASEWLENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ #include #include +#include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -109,10 +110,10 @@ class BaseWLEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_BASEWLENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 40a1fc761b..0baa374714 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 0d03e71e54..bb43836ff7 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseApplyEngine import BaseApplyEngine class ApplyEngine(BaseApplyEngine): type = 'ApplyEngine' cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") + mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 923c1a2f38..046dfaf4e8 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -29,9 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +# from m5.objects.WLEngine import WLEngine +# from m5.objects.PushEngine import PushEngine +# from m5.objects.ApplyEngine import ApplyEngine class MPU(ClockedObject): type = 'MPU' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index fa9d921a26..eb0eed18ab 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,15 +27,11 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BasePushEngine import BasePushEngine -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +class PushEngine(BasePushEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") + mpu = Param.MPU(Parent.any, "MPU object that 
owns this PushEngine") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 79afe3b7d0..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -29,6 +29,7 @@ Import('*') SimObject('ApplyEngine.py') SimObject('MPU.py') +SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a8f3bd20ea..12fbcf9b4f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseWLEngine import BaseWLEngine class WLEngine(BaseWLEngine): type = 'WLEngine' cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file + mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 41a568bd27..64ae71e290 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -30,16 +30,16 @@ namespace gem5{ -ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): +ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): BaseApplyEngine(params) {} -virtual bool +bool ApplyEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } -virtual bool +bool ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh 
index fd2bca008f..855ebbd8b0 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ #include #include @@ -45,10 +45,14 @@ namespace gem5 class ApplyEngine : public BaseApplyEngine { private: - MPU mpu; + MPU* mpu; protected: virtual bool sendMemReq(PacketPtr pkt); virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: ApplyEngine(const ApplyEngineParams ¶ms); +}; + } + +#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 09ab23a835..27f7c8e314 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -28,16 +28,19 @@ #include "accl/graph/sega/mpu.hh" +namespace gem5 +{ + void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId) == -1) { + if (((int16_t) applyEngine->getRequestorId()) == -1) { applyEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) pushEngine->getRequestorId) == -1) { + if (((int16_t) pushEngine->getRequestorId()) == -1) { pushEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) wlEngine->getRequestorId) == -1) { + if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } } @@ -204,3 +207,5 @@ MPU::recvPushUpdate(PacketPtr pkt) return true; } + +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 93d1dd8bb3..b37821c200 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -39,6 +39,9 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" +namespace gem5 +{ + class MPU : public ClockedObject { private: @@ -129,6 +132,8 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; +}; + } 
#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index eee6b1f22f..938128e05f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_WLENGINE_HH__ -#define __ACCL_WLENGINE_HH__ +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ #include #include @@ -54,4 +54,7 @@ class WLEngine : public BaseWorkListEngine virtual bool sendWLNotif(WorkListItem wl); public: WLEngine(const WLEngineParams ¶ms); -} \ No newline at end of file +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file From 99ada0fd1251304d6df5d09ac3c52a5193f57bc9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:40:15 -0800 Subject: [PATCH 044/279] Lots of debugging. --- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 4 +- src/accl/graph/base/base_apply_engine.cc | 105 ++++++++++------------ src/accl/graph/base/base_apply_engine.hh | 40 ++------- src/accl/graph/base/base_push_engine.cc | 45 +--------- src/accl/graph/base/base_push_engine.hh | 10 +-- src/accl/graph/base/base_wl_engine.cc | 6 +- src/accl/graph/base/base_wl_engine.hh | 6 +- src/accl/graph/sega/MPU.py | 6 +- src/accl/graph/sega/apply_engine.cc | 10 ++- src/accl/graph/sega/apply_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 107 +++++++++++++++++++---- src/accl/graph/sega/mpu.hh | 20 ++--- src/accl/graph/sega/push_engine.cc | 11 +-- src/accl/graph/sega/push_engine.hh | 12 ++- src/accl/graph/sega/wl_engine.cc | 19 ++-- src/accl/graph/sega/wl_engine.hh | 13 ++- src/mem/packet.hh | 3 - 19 files changed, 217 insertions(+), 212 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 23fdfbb08a..45d94b3fd2 
100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/base_apply_engine.hh" + cxx_header = "accl/graph/base/base_apply_engine.hh" cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 6ed5d25978..891221c06d 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,4 +34,3 @@ class BasePushEngine(ClockedObject): cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7384e876ef..3ecf030138 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -31,8 +31,8 @@ class BaseWLEngine(ClockedObject): type = 'BaseWLEngine' - cxx_header = "accl/base_wl_engine.hh" + cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 301f5931bf..731cd5c345 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -38,8 +38,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), requestorId(-1), - applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize), + queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} @@ -62,14 +61,14 @@ BaseApplyEngine::setRequestorId(RequestorID 
requestorId) this->requestorId = requestorId; } -bool BaseApplyEngine::handleWL(PacketPtr pkt){ - auto queue = applyReadQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } +bool BaseApplyEngine::recvWLNotif(Addr addr){ + // TODO: Investigate the situation where the queue is full. + // if (applyReadQueue.size() == queueSize){ + // // applyReadQueue.sendPktRetry = true; + // return true; + // } else{ + applyReadQueue.push(addr); + // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } @@ -77,78 +76,64 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ } void BaseApplyEngine::processNextApplyCheckEvent(){ - auto queue = applyReadQueue; - // if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - // if (queue.sendPktRetry && !queue.blocked()){ - // // respPort.trySendRetry(); - // queue.sendPktRetry = false; - // } - // conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; + Addr addr = applyReadQueue.front(); + Addr req_addr = (addr / 64) * 64; + int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (sendMemReq(memPkt)){ - queue.pop(); + applyReadQueue.pop(); } - if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } } -void +bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { // FIXME: change the event, remove the retry parts + applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } + return true; } void BaseApplyEngine::processNextApplyEvent(){ - auto queue = applyWriteQueue; - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); + PacketPtr pkt = applyWriteQueue.front(); + uint8_t* data = pkt->getPtr(); - RequestPtr 
request = pkt->req; - int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop){ - // if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - recvApplyNotif(wl.prop, - wl.degree, - wl.edgeIndex)){ - queue.pop(); - // memPort.trySendRetry(); - // queue.sendPktRetry = false; - } - }else{ - queue.applyQueue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + if (temp_prop != prop) { + // TODO: instead of min add a Reduce function. + //update prop with temp_prop + if(prop < temp_prop) { + wl.prop = prop; + }else { + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (sendMemReq(writePkt) && + sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + applyWriteQueue.pop(); } - if(!queue.empty() && !nextApplyEvent.scheduled()){ + }else { + applyWriteQueue.pop(); + } + if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 56b43cfb7b..b7c0db90cb 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,53 +45,24 @@ namespace gem5 class BaseApplyEngine : public ClockedObject { private: - //FIXME: Remove queue defenition from here. - struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize) - {} - }; RequestorID requestorId; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; + std::queue applyReadQueue; + std::queue applyWriteQueue; + int queueSize; std::unordered_map requestOffset; - bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const BaseApplyEngineParams &apply); @@ -101,6 +72,9 @@ class BaseApplyEngine : public ClockedObject RequestorID getRequestorId(); void 
setRequestorId(RequestorID requestorId); + + bool recvWLNotif(Addr addr); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6871154276..d93cbdf8da 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,10 +33,9 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : +BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -65,44 +64,6 @@ BasePushEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BasePushEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - -} - - - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) @@ -135,7 +96,7 @@ BasePushEngine::processNextReceiveEvent() for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 
+ (notif.edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 873cb26b3d..c723932975 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -54,8 +54,6 @@ class BasePushEngine : public ClockedObject {} }; - virtual void startup() override; - RequestorID requestorId; std::queue notifQueue; @@ -72,24 +70,22 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); protected: - virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; public: - BasePushEngine(const PushEngineParams ¶ms); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -97,6 +93,8 @@ class BasePushEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool handleMemResp(PacketPtr pkt); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index b863b38e19..806ab4a6c3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -30,8 +30,6 @@ #include -#include "accl/graph/base/util.hh" - namespace gem5 { @@ -87,7 +85,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 
0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (sendMemReq()){ + if (sendMemReq(memPkt)){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -136,7 +134,7 @@ BaseWLEngine::processNextWLReduceEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); if (sendMemReq(writePkt) && - sendWLNotif(writePkt)) { + sendWLNotif(writePkt->getAddr())) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3d807d8b06..a2cab4c7e2 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -91,7 +91,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - void handleMemResp(PacketPtr resp); + EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -101,7 +101,7 @@ class BaseWLEngine : public ClockedObject */ protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool sendWLNotif(WorkListItem wl) = 0; + virtual bool sendWLNotif(Addr addr) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -111,7 +111,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 046dfaf4e8..68cfb3d42d 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,9 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - workListEngine = Param.WLEngine("WLEngine object to connect to " + work_list_engine = Param.WLEngine("WLEngine object to connect to " "This MPU") - 
applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " "This MPU") - pushEngine = Param.PushEngine("PushEngine object to connect to " + push_engine = Param.PushEngine("PushEngine object to connect to " "This MPU") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 64ae71e290..bc45850041 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,11 +27,13 @@ */ #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5{ -ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): - BaseApplyEngine(params) +ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : + BaseApplyEngine(params), + mpu(params.mpu) {} bool @@ -40,9 +42,9 @@ ApplyEngine::sendMemReq(PacketPtr pkt){ } bool -ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); - + return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 855ebbd8b0..17e3280cb5 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,14 +42,21 @@ namespace gem5 { +class MPU; + class ApplyEngine : public BaseApplyEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 27f7c8e314..4824bcd699 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -31,6 +31,31 @@ namespace gem5 { +MPU::MPU(const MPUParams ¶ms): + 
ClockedObject(params), + nextRequestorId(0), + respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), + memPort(name() + ".memPort", this), + applyEngine(params.apply_engine), + pushEngine(params.push_engine), + wlEngine(params.work_list_engine) +{} + +Port& +MPU::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "respPort") { + return respPort; + } else if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + void MPU::startup() { @@ -43,6 +68,37 @@ MPU::startup() if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } + + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + memPort.sendFunctional(pkt); + } } AddrRangeList @@ -54,7 +110,7 @@ MPU::MPURespPort::getAddrRanges() const bool MPU::MPURespPort::recvTimingReq(PacketPtr pkt) { - return wlEngine->handleWLUpdate(pkt); + return owner->handleWLUpdate(pkt); } Tick @@ -106,12 +162,6 @@ MPU::MPUReqPort::recvReqRetry() } } -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - void MPU::MPUMemPort::sendPacket(PacketPtr pkt) { @@ -124,6 +174,14 @@ 
MPU::MPUMemPort::sendPacket(PacketPtr pkt) } } +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + owner->handleMemResp(pkt); + return true; +} + void MPU::MPUMemPort::recvReqRetry() { @@ -146,19 +204,21 @@ MPU::getAddrRanges() void MPU::recvFunctional(PacketPtr pkt) { - if (pkt->isUpdateWL()) { - panic("Functional requests should not be made to WL.") + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); //TODO: Might be a good idea to implement later. // wlEngine->recvFunctional(pkt); } else { - memPort.recvFuctional(pkt); + memPort.sendFunctional(pkt); } } bool MPU::handleMemReq(PacketPtr pkt) { - return memPort.recvTimingReq(pkt); + //TODO: Investigate sending true all the time + memPort.sendPacket(pkt); + return true; } void @@ -177,33 +237,42 @@ MPU::handleMemResp(PacketPtr pkt) } bool -MPU::recvWLNotif(WorkListItem wl) +MPU::handleWLUpdate(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +bool +MPU::recvWLNotif(Addr addr) { - return applyEngine->recvWLUpdate(wl); + return applyEngine->recvWLNotif(addr); } bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - return pushEngine->recvApplyUpdate(prop, degree, edge_index); + return pushEngine->recvApplyNotif(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRangeList()) { + for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (!memPort.sendPacket(pkt)) { + if (memPort.blocked()) { return false; + } else { + memPort.sendPacket(pkt); + return true; } - return true; } } - if (!reqPort.sendPacket(pkt)) { + if (reqPort.blocked()) { return false; } + reqPort.sendPacket(pkt); return true; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index b37821c200..be5139c0e0 
100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ -#include "accl/graph/base/util.hh" #include "accl/graph/sega/apply_engine.hh" #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" @@ -91,7 +90,7 @@ class MPU : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, MPU* owner): + MPUMemPort(const std::string& name, MPU* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -119,21 +118,22 @@ class MPU : public ClockedObject AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - - bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool recvPushUpdate(PacketPtr pkt); - public: MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool handleWLUpdate(PacketPtr pkt); + bool recvWLNotif(Addr addr); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool recvPushUpdate(PacketPtr pkt); }; } -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e43512c6f4..922ae32ed2 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,13 +27,14 @@ */ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngine ¶ms) : +PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - owner(params.mpu) + mpu(params.mpu) { } @@ -44,15 +45,15 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } bool 
-PushEngine::sendMemReq(PacketPtr) +PushEngine::sendMemReq(PacketPtr pkt) { - return owner->handleMemReq(pkt); + return mpu->handleMemReq(pkt); } bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return owner->recvPushUpdate(pkt); + return mpu->recvPushUpdate(pkt); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 54ef72d5f9..e4bb83d2bc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,16 +30,20 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" namespace gem5 { + +class MPU; + class PushEngine : public BasePushEngine { private: - MPU* owner; + MPU* mpu; protected: - virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: @@ -48,8 +52,8 @@ class PushEngine : public BasePushEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; -} +}; } -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9608d0cbc4..40ec755969 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,24 +27,25 @@ */ #include "accl/graph/sega/wl_engine.hh" - -#include - +#include "accl/graph/sega/mpu.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params) + BaseWLEngine(params), + mpu(params.mpu) {} -virtual bool +bool WLEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } // FIXME: handle the case where Apply queue is full -virtual bool -WLEngine::sendWLNotif(WorkListItem wl){ - mpu->recvWLNotif(wl); +bool +WLEngine::sendWLNotif(Addr addr){ + mpu->recvWLNotif(addr); return true; -} \ No newline at end of file +} + +} diff --git a/src/accl/graph/sega/wl_engine.hh 
b/src/accl/graph/sega/wl_engine.hh index 938128e05f..c5f49ff6a2 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,16 +45,23 @@ namespace gem5 { -class WLEngine : public BaseWorkListEngine +// class MPU; + +class WLEngine : public BaseWLEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool sendWLNotif(WorkListItem wl); + virtual bool sendWLNotif(Addr addr); + public: + WLEngine(const WLEngineParams ¶ms); }; } -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 69078fe8f1..69686e7835 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -179,7 +179,6 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent - UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -269,8 +268,6 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } - bool isUpdateWL() const {return testCmdAttrib(updateWL);} - Command responseCommand() const { From 7afc5fcc7791c634a954c855090ea4aa0bb42f00 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:59:31 -0800 Subject: [PATCH 045/279] Style fix. 
--- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 45d94b3fd2..e48b425b01 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/graph/base/base_apply_engine.hh" + cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 891221c06d..793b0a7c92 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -33,4 +33,3 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 3ecf030138..473fd05313 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -35,4 +35,3 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - From 8319d4a5f8191bff90d34c715f3ae81ee6732068 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:20:51 -0800 Subject: [PATCH 046/279] Adding PARAMS macro. 
--- src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/base/base_apply_engine.hh | 2 ++ src/accl/graph/base/base_push_engine.hh | 2 ++ src/accl/graph/base/base_wl_engine.hh | 3 +++ src/accl/graph/sega/SConscript | 8 ++++---- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 9 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..5e82a44971 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) +SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) +SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index b7c0db90cb..fbcf95c238 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -65,6 +65,8 @@ class BaseApplyEngine : public ClockedObject virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: + PARAMS(BaseApplyEngine); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index c723932975..446f6a1186 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -85,6 +85,8 @@ class BasePushEngine : public ClockedObject public: + PARAMS(BasePushEngine); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a2cab4c7e2..4cb492914c 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -104,6 +104,9 @@ class BaseWLEngine : public ClockedObject virtual bool sendWLNotif(Addr addr) = 0; public: + + PARAMS(BaseWLEngine); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..793dacc2ef 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('MPU.py') -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) +SimObject('MPU.py', sim_objects=["MPU"]) +SimObject('PushEngine.py', sim_objects=["PushEngine"]) +SimObject('WLEngine.py', sim_objects=["WLEngine"]) Source('apply_engine.cc') Source('mpu.cc') diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 17e3280cb5..c7d3073e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -56,7 +56,7 @@ class ApplyEngine : public BaseApplyEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: - + PARAMS(ApplyEngine); ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index be5139c0e0..cf241c9063 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -119,7 +119,7 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt); public: - + PARAMS(MPU); MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e4bb83d2bc..1a800e58f3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -47,6 +47,7 @@ class PushEngine : public 
BasePushEngine virtual bool sendPushUpdate(PacketPtr pkt); public: + PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c5f49ff6a2..238ffbe724 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -59,7 +59,7 @@ class WLEngine : public BaseWLEngine virtual bool sendWLNotif(Addr addr); public: - + PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); }; From 77d8e1a2f9e8eafae1a0dab8d324835757a5fdcf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:33:13 -0800 Subject: [PATCH 047/279] First compilation after restructure. --- src/accl/graph/base/BaseApplyEngine.py | 1 + src/accl/graph/base/BasePushEngine.py | 1 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/sega/SConscript | 8 ++++---- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index e48b425b01..fdabefc732 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseApplyEngine(ClockedObject): + abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 793b0a7c92..d30124a6a4 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BasePushEngine(ClockedObject): + abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 
473fd05313..7dcacefd97 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseWLEngine(ClockedObject): + abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 5e82a44971..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) -SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) -SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) +SimObject('BaseApplyEngine.py') +SimObject('BasePushEngine.py') +SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 793dacc2ef..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) -SimObject('MPU.py', sim_objects=["MPU"]) -SimObject('PushEngine.py', sim_objects=["PushEngine"]) -SimObject('WLEngine.py', sim_objects=["WLEngine"]) +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply_engine.cc') Source('mpu.cc') From 196679c942dde2e95bddb110593975b9899fc283 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 07:38:41 -0800 Subject: [PATCH 048/279] Adding config file for SEGA and missing ports. 
--- configs/accl/sega.py | 34 ++++++++++++++++++++++++++++++++++ src/accl/graph/sega/MPU.py | 10 +++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 configs/accl/sega.py diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..288b1211e4 --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,34 @@ +import m5 +from m5.objects import * + +class PyMPU(MPU): + def __init__(self, clk_domain): + super().__init__() + self.clk_domain = clk_domain + self.apply_engine = ApplyEngine() + self.push_engine = PushEngine() + self.wl_engine = WLEngine() + +class SEGA(System): + + def __init__(self): + super(SEGA, self).__init__() + # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + + self.mpu = PyMPU(self.clk_domain) + self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mpu.memPort = self.mem_ctrl.port + self.mpu.reqPort = self.mpu.respPort + + +system = SEGA() +root = Root(full_system = False, system = system) + +m5.instantiate() + +exit_event = m5.simulate() +print("Simulation finished!") +exit() \ No newline at end of file diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 68cfb3d42d..efd8dbc11f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,13 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - work_list_engine = Param.WLEngine("WLEngine object to connect to " + apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " "This MPU") - apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " + push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - push_engine = Param.PushEngine("PushEngine object to connect to " + work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " "This MPU") + + respPort = 
ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") From d5eb804b7727a44fb6106c77a3a580ca7b54bd2f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 12:22:14 -0800 Subject: [PATCH 049/279] Adding BaseEngine class and started pointer fix. --- src/accl/graph/base/BaseApplyEngine.py | 4 +- src/accl/graph/base/BaseEngine.py | 38 ++++++++++ src/accl/graph/base/BasePushEngine.py | 2 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 22 +----- src/accl/graph/base/base_apply_engine.hh | 9 +-- src/accl/graph/base/base_engine.cc | 75 ++++++++++++++++++++ src/accl/graph/base/base_engine.hh | 90 ++++++++++++++++++++++++ src/accl/graph/sega/ApplyEngine.py | 2 +- 9 files changed, 213 insertions(+), 30 deletions(-) create mode 100644 src/accl/graph/base/BaseEngine.py create mode 100644 src/accl/graph/base/base_engine.cc create mode 100644 src/accl/graph/base/base_engine.hh diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index fdabefc732..be849ed1af 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseApplyEngine(ClockedObject): +class BaseApplyEngine(BaseEngine): abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py new file mode 100644 index 0000000000..3eb5f0cbbc --- /dev/null +++ b/src/accl/graph/base/BaseEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseEngine(ClockedObject): + abstract = True + type = 'BaseEngine' + cxx_header = "accl/graph/base/base_engine.hh" + cxx_class = 'gem5::BaseEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index d30124a6a4..c52a65abf9 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,3 +34,5 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7dcacefd97..ec34b52005 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -36,3 +36,4 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 731cd5c345..4fd53fb037 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -36,31 +36,12 @@ namespace gem5 { BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), + BaseEngine(params), queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -Port & -BaseApplyEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseApplyEngine::getRequestorId() -{ - return requestorId; -} - -void -BaseApplyEngine::setRequestorId(RequestorID 
requestorId) -{ - this->requestorId = requestorId; -} - bool BaseApplyEngine::recvWLNotif(Addr addr){ // TODO: Investigate the situation where the queue is full. // if (applyReadQueue.size() == queueSize){ @@ -82,6 +63,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; + // FIXME: sendMemReq returns void, use memPortBlocked to check instead. if (sendMemReq(memPkt)){ applyReadQueue.pop(); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index fbcf95c238..f81f23428e 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "mem/request.hh" @@ -42,12 +43,10 @@ namespace gem5 { -class BaseApplyEngine : public ClockedObject +class BaseApplyEngine : public BaseEngine { private: - RequestorID requestorId; - std::queue applyReadQueue; std::queue applyWriteQueue; int queueSize; @@ -61,7 +60,6 @@ class BaseApplyEngine : public ClockedObject void processNextApplyEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -72,9 +70,6 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvWLNotif(Addr addr); bool handleMemResp(PacketPtr resp); }; diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc new file mode 100644 index 0000000000..d53e2e683a --- /dev/null +++ b/src/accl/graph/base/base_engine.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_engine.hh" + +namespace gem5 +{ + +BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId()), + memPort(name() + ".memPort", this) +{} + + +void +BaseEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); + +} + +void +BaseEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh new file mode 100644 index 0000000000..f9f500e118 --- /dev/null +++ b/src/accl/graph/base/base_engine.hh @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ + +#include +#include + +#include "mem/packet.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + System* system; + const RequestorID requestorId; + MemPort memPort; + + protected: + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + virtual bool handleMemResp(PacketPtr resp) = 0; + + public: + PARAMS(BaseEngine); + + BaseEngine(const BaseEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index bb43836ff7..5bb0dc0c25 100644 --- 
a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -34,4 +34,4 @@ class ApplyEngine(BaseApplyEngine): cxx_header = "accl/graph/sega/apply_engine.hh" cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") + push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") From 422248c0b97774a048aeae6d3c1966044eba0882 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 21:44:29 -0800 Subject: [PATCH 050/279] Cont. fixing pointer issue. --- src/accl/graph/base/BaseApplyEngine.py | 2 - src/accl/graph/base/BaseWLEngine.py | 7 +-- src/accl/graph/base/base_apply_engine.cc | 53 ++++++++-------- src/accl/graph/base/base_apply_engine.hh | 4 +- src/accl/graph/base/base_wl_engine.cc | 79 +++++++----------------- src/accl/graph/base/base_wl_engine.hh | 52 +++------------- 6 files changed, 63 insertions(+), 134 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index be849ed1af..9b240581ac 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -34,5 +34,3 @@ class BaseApplyEngine(BaseEngine): type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' - - applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index ec34b52005..7311c396b3 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -27,13 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseWLEngine(ClockedObject): +class BaseWLEngine(BaseEngine): abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' - - wlQueueSize = Param.Unsigned(32, "Size of write 
queue") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 4fd53fb037..7f6c32cf39 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,34 +37,35 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -bool BaseApplyEngine::recvWLNotif(Addr addr){ +bool +BaseApplyEngine::recvWLNotif(Addr addr) +{ // TODO: Investigate the situation where the queue is full. - // if (applyReadQueue.size() == queueSize){ - // // applyReadQueue.sendPktRetry = true; - // return true; - // } else{ applyReadQueue.push(addr); - // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } -void BaseApplyEngine::processNextApplyCheckEvent(){ +void +BaseApplyEngine::processNextApplyCheckEvent() +{ + // TODO: We might want to change the way this function + // pops items off queue, maybe we should pop every n cycles + // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - // FIXME: sendMemReq returns void, use memPortBlocked to check instead. 
- if (sendMemReq(memPkt)){ + if (!memPortBlocked()) { + sendMemReq(memPkt); applyReadQueue.pop(); } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -75,7 +76,6 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { - // FIXME: change the event, remove the retry parts applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -84,38 +84,39 @@ BaseApplyEngine::handleMemResp(PacketPtr pkt) } void -BaseApplyEngine::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent() +{ PacketPtr pkt = applyWriteQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop) { + WorkListItem wl = memoryToWorkList(data + request_offset); + // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem + // to applyengine if temp_prop < prop. If temp_prop has not changed, why + // fwd it to applyengine? + if (wl.temp_prop < wl.prop) { // TODO: instead of min add a Reduce function. //update prop with temp_prop - if(prop < temp_prop) { - wl.prop = prop; - }else { - wl.prop = temp_prop; - } + wl.prop = wl.temp_prop; //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. 
PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - applyWriteQueue.pop(); + + if (!memPortBlocked()) { + if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + sendMemReq(writePkt); + applyWriteQueue.pop(); + } } - }else { + } else { applyWriteQueue.pop(); } - if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ + if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index f81f23428e..dc7188ab56 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -37,7 +37,6 @@ #include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" namespace gem5 @@ -60,6 +59,7 @@ class BaseApplyEngine : public BaseEngine void processNextApplyEvent(); protected: + virtual bool handleMemResp(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -71,7 +71,7 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - bool handleMemResp(PacketPtr resp); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 806ab4a6c3..aab39fb7a3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -34,61 +34,37 @@ namespace gem5 { BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), - updateQueue(params.wlQueueSize), - responseQueue(params.wlQueueSize), + BaseEngine(params), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) {} -Port & -BaseWLEngine::getPort(const std::string &if_name, PortID 
idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseWLEngine::getRequestorId() +bool +BaseWLEngine::handleWLUpdate(PacketPtr pkt) { - return requestorId; + updateQueue.push(pkt); + if(!nextWLReadEvent.scheduled()) { + schedule(nextWLReadEvent, nextCycle()); + } + return true; } -void -BaseWLEngine::setRequestorId(RequestorID requestorId) +void BaseWLEngine::processNextWLReadEvent() { - this->requestorId = requestorId; -} + PacketPtr pkt = updateQueue.front(); -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = updateQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); + Addr addr = pkt->getAddr(); + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - if(!nextWLReadEvent.scheduled()){ - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[request] = req_offset; -void BaseWLEngine::processNextWLReadEvent(){ - auto queue = updateQueue; - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (sendMemReq(memPkt)){ - queue.pop(); + if (memPortBlocked()) { + sendMemReq(memPkt) + updateQueue.pop(); } - if(!queue.empty() && !nextWLReadEvent.scheduled()){ + if (!queue.empty() && !nextWLReadEvent.scheduled()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -96,24 +72,15 @@ void BaseWLEngine::processNextWLReadEvent(){ bool BaseWLEngine::handleMemResp(PacketPtr pkt) { - auto queue = responseQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; + responseQueue.push(pkt); + 
if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } return true; } void BaseWLEngine::processNextWLReduceEvent(){ - auto queue = responseQueue; - auto updateQ = updateQueue; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 4cb492914c..063e9909be 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -32,57 +32,26 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" namespace gem5 { -class BaseWLEngine : public ClockedObject +class BaseWLEngine : public BaseEngine { private: - //FIXME: Change this - struct WLQueue{ - std::queue wlQueue; - uint32_t queueSize; - bool sendPktRetry; - - void resize(uint32_t size){ - queueSize = size; - } - - bool blocked(){ - return (wlQueue.size() == queueSize); - } - bool empty(){ - return wlQueue.empty(); - } - void push(PacketPtr pkt){ - wlQueue.push(pkt); - } - void pop(){ - wlQueue.pop(); - } - PacketPtr front(){ - return wlQueue.front(); - } - - WLQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} - }; - - RequestorID requestorId; - WLQueue updateQueue; - WLQueue responseQueue; - - std::unordered_map requestOffset; + std::queue updateQueue; + std::queue responseQueue; + + std::unordered_map requestOffsetMap; + std::unordered_map requestValueMap; //Events EventFunctionWrapper nextWLReadEvent; @@ -100,7 +69,7 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; public: @@ -112,11 +81,8 @@ class 
BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + }; } From dfb1ce217908268baa8be2009b1a5362940bfcb6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 10:16:01 -0800 Subject: [PATCH 051/279] Cont. fix pointer issue. --- src/accl/graph/base/BasePushEngine.py | 6 +-- src/accl/graph/base/base_apply_engine.hh | 1 - src/accl/graph/base/base_push_engine.cc | 19 ------- src/accl/graph/base/base_push_engine.hh | 19 ++----- src/accl/graph/base/base_wl_engine.cc | 64 +++++++++--------------- 5 files changed, 31 insertions(+), 78 deletions(-) diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index c52a65abf9..2163864be3 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -27,12 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BasePushEngine(ClockedObject): +class BasePushEngine(BaseEngine): abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index dc7188ab56..2cb9d8b918 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -48,7 +48,6 @@ class BaseApplyEngine : public BaseEngine std::queue applyReadQueue; std::queue applyWriteQueue; - int queueSize; std::unordered_map requestOffset; diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index d93cbdf8da..f2384c434b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ 
b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,6 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -46,24 +45,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : { } -Port & -BasePushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BasePushEngine::getRequestorId() -{ - return requestorId; -} - -void -BasePushEngine::setRequestorId(RequestorID requestorId) -{ - this->requestorId = requestorId; -} - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 446f6a1186..f568b6ecc3 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,16 +31,16 @@ #include +#include "accl/graph/base/base_engine.hh" #include "mem/port.hh" #include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" -#include "sim/clocked_object.hh" namespace gem5 { -class BasePushEngine : public ClockedObject +class BasePushEngine : public BaseEngine { private: @@ -53,9 +53,6 @@ class BasePushEngine : public ClockedObject prop(prop), degree(degree), edgeIndex(edge_index) {} }; - - RequestorID requestorId; - std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -64,8 +61,6 @@ class BasePushEngine : public ClockedObject std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue memReqQueue; // Infinite queueing? 
- std::queue updateQueue; // int updateQueueSize; // int updateQueueLen; @@ -80,8 +75,8 @@ class BasePushEngine : public ClockedObject void processNextSendEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr pkt); public: @@ -89,14 +84,8 @@ class BasePushEngine : public ClockedObject BasePushEngine(const BasePushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool handleMemResp(PacketPtr pkt); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index aab39fb7a3..d5b18bafa0 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,13 +52,15 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); + uint32_t data = *(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[request] = req_offset; + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; if (memPortBlocked()) { sendMemReq(memPkt) @@ -80,51 +82,35 @@ BaseWLEngine::handleMemResp(PacketPtr pkt) } void -BaseWLEngine::processNextWLReduceEvent(){ - PacketPtr update = updateQ.front(); - uint8_t* value = update->getPtr(); - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); - RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; +BaseWLEngine::processNextWLReduceEvent() +{ + PacketPtr resp = responseQueue.front(); + uint8_t* respData = resp->getPtr(); + Addr request_offset = requestOffsetMap[resp->req]; + uint32_t value = requestValueMap[resp->req]; 
WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != *value){ + + if (value < wl.temp_prop){ //update prop with temp_prop - if(*value < temp_prop){ - temp_prop = *value; - } - // if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + wl.temp_prop = value; + + uint8_t* wlData = workListToMemory(wl); + memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendWLNotif(writePkt->getAddr())) { - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); - updateQ.sendPktRetry = false; + getWritePacket(pkt->getAddr(), 64, respData, requestorId); + + if (!memPortBlocked()) { + if (sendWLNotif(pkt->getAddr() + request_offset)) { + sendMemReq(writePkt); + responseQueue.pop(); + // TODO: Erase map entries, delete wlData; } } } - else{ - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - updateQ.sendPktRetry = false; - } - + else { + responseQueue.pop(); } - if (!queue.empty() && !nextWLReduceEvent.scheduled()){ + if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } From f4473eb47ce39cf90627bd21f0620864b6e91d99 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:07:21 -0800 Subject: [PATCH 052/279] Cont. fix pointer issue. MemQ to BaseEngine. 
--- src/accl/graph/base/base_apply_engine.cc | 22 ++----- src/accl/graph/base/base_apply_engine.hh | 11 +--- src/accl/graph/base/base_engine.cc | 13 +++- src/accl/graph/base/base_engine.hh | 17 +++++- src/accl/graph/base/base_push_engine.cc | 77 ++++++------------------ src/accl/graph/base/base_push_engine.hh | 16 +---- src/accl/graph/base/base_wl_engine.cc | 22 ++----- src/accl/graph/base/base_wl_engine.hh | 3 +- src/accl/graph/sega/mpu.hh | 2 - 9 files changed, 65 insertions(+), 118 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7f6c32cf39..842481c2d1 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -73,20 +73,10 @@ BaseApplyEngine::processNextApplyCheckEvent() } } -bool -BaseApplyEngine::handleMemResp(PacketPtr pkt) -{ - applyWriteQueue.push(pkt); - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; -} - void -BaseApplyEngine::processNextApplyEvent() +BaseApplyEngine::processNextMemRespEvent() { - PacketPtr pkt = applyWriteQueue.front(); + PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; @@ -110,14 +100,14 @@ BaseApplyEngine::processNextApplyEvent() if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); - applyWriteQueue.pop(); + memRespQueue.pop(); } } } else { - applyWriteQueue.pop(); + memRespQueue.pop(); } - if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 2cb9d8b918..02646a74ff 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,21 +45,17 @@ namespace gem5 class BaseApplyEngine : 
public BaseEngine { private: - std::queue applyReadQueue; - std::queue applyWriteQueue; std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - protected: - virtual bool handleMemResp(PacketPtr pkt); - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) = 0; + virtual void processNextMemRespEvent(); public: PARAMS(BaseApplyEngine); @@ -70,7 +66,6 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - }; } diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index d53e2e683a..6a50e1630e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,7 +35,8 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId()), - memPort(name() + ".memPort", this) + memPort(name() + ".memPort", this), + nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -72,4 +73,14 @@ BaseEngine::MemPort::recvReqRetry() } } +bool +BaseEngine::handleMemResp(PacketPtr pkt) +{ + memRespQueue.push(pkt); + if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemResponseEvent, nextCycle()); + } + return true; +} + } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index f9f500e118..4f5a29676d 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -66,14 +66,28 @@ class BaseEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; + System* system; const RequestorID requestorId; MemPort memPort; + bool handleMemResp(PacketPtr resp); + EventFunctionWrapper nextMemRespEvent; + protected: bool 
memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - virtual bool handleMemResp(PacketPtr resp) = 0; + + // TODO: Add this later, maybe? + // int memRespQueueSize; + std::queue memRespQueue; + /* Respective function for nextMemRespEvent. + All the classes inheriting from this class will + do their main processing in this function. For + example, BaseWLEngine reduces the temp_pro with + the value of update in this function. + */ + virtual void processNextMemRespEvent() = 0; public: PARAMS(BaseEngine); @@ -82,7 +96,6 @@ class BaseEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f2384c434b..4c43f95939 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -40,7 +40,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : // updateQueue(params.update_queue_size), // updateQueueLen(0), nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextSendEvent([this] { processNextSendEvent(); }, name()) { } @@ -49,16 +48,6 @@ bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - //FIXME: There should be a check if the queues are full. 
- // if (vertexQueueLen < vertexQueueSize) { - // vertexQueue.push(pkt) - // vertexQueueLen++; - // if (!nextReceiveEvent.scheduled()) { - // schedule(nextReceiveEvent, nextCycle()); - // } - // return true; - // } - // return false; notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); @@ -67,7 +56,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, } void -BasePushEngine::processNextReceiveEvent() +BasePushEngine::processNextReadEvent() { ApplyNotif notif = notifQueue.front(); @@ -95,39 +84,28 @@ BasePushEngine::processNextReceiveEvent() offset_queue.push_back(req_offset); num_edge_queue.push_back(1); } - } + }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - memReqQueue.push(pkt); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; + if (!memPortBlocked()) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = notif.prop; + sendMemReq(pkt); + notifQueue.pop(); + } } - notifQueue.pop(); - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + if (!nextReadEvent.scheduled() && !notifQueue.empty()) { schedule(nextReadEvent, nextCycle()); } } void -BasePushEngine::processNextReadEvent() -{ - PacketPtr pkt = memReqQueue.front(); - if (!sendMemReq(pkt)) { - memReqQueue.pop(); - } - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -bool -BasePushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::processNextMemRespEvent() { + PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -137,7 +115,7 @@ BasePushEngine::handleMemResp(PacketPtr pkt) int edge_in_bytes = sizeof(Edge) / 
sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); uint32_t *update_data = new uint32_t; @@ -146,29 +124,14 @@ BasePushEngine::handleMemResp(PacketPtr pkt) PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - updateQueue.push(update); - } - - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); - } - - //TODO: Should we always return true? It's the response from the memory - // so maybe yes. We assume the receiving bandwidth of the PushEngine is - // higher than its demand bandwidth - return true; -} - -void -BasePushEngine::processNextSendEvent() -{ - PacketPtr pkt = updateQueue.front(); - if (!sendPushUpdate(pkt)) { - updateQueue.pop(); + if (sendPushUpdate(update)) { + memRespQueue.pop(); + // TODO: Erase map entries here. 
+ } } - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index f568b6ecc3..5a6ef85b0f 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -43,7 +43,6 @@ namespace gem5 class BasePushEngine : public BaseEngine { private: - struct ApplyNotif { uint32_t prop; uint32_t degree; @@ -53,30 +52,20 @@ class BasePushEngine : public BaseEngine prop(prop), degree(degree), edgeIndex(edge_index) {} }; + std::queue notifQueue; // int vertexQueueSize; - // int vertexQueueLen; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue updateQueue; - // int updateQueueSize; - // int updateQueueLen; - - EventFunctionWrapper nextReceiveEvent; - void processNextReceiveEvent(); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextSendEvent; - void processNextSendEvent(); - protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual bool handleMemResp(PacketPtr pkt); + virtual void processNextMemRespEvent(); public: @@ -85,7 +74,6 @@ class BasePushEngine : public BaseEngine BasePushEngine(const BasePushEngineParams ¶ms); bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index d5b18bafa0..5d84e34ccd 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -63,7 +63,7 @@ void BaseWLEngine::processNextWLReadEvent() requestValueMap[memPkt->req] = value; if (memPortBlocked()) { - sendMemReq(memPkt) + sendMemReq(memPkt); updateQueue.pop(); } if (!queue.empty() && !nextWLReadEvent.scheduled()) { @@ -71,20 +71,10 @@ void 
BaseWLEngine::processNextWLReadEvent() } } -bool -BaseWLEngine::handleMemResp(PacketPtr pkt) -{ - responseQueue.push(pkt); - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; -} - void -BaseWLEngine::processNextWLReduceEvent() +BaseWLEngine::processNextMemRespEvent() { - PacketPtr resp = responseQueue.front(); + PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; @@ -102,15 +92,15 @@ BaseWLEngine::processNextWLReduceEvent() if (!memPortBlocked()) { if (sendWLNotif(pkt->getAddr() + request_offset)) { sendMemReq(writePkt); - responseQueue.pop(); + memRespQueue.pop(); // TODO: Erase map entries, delete wlData; } } } else { - responseQueue.pop(); + memRespQueue.pop(); } - if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 063e9909be..ab8952de41 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -69,8 +69,8 @@ class BaseWLEngine : public BaseEngine Write edgelist loc in buffer */ protected: - virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; + virtual void processNextMemRespEvent(); public: @@ -82,7 +82,6 @@ class BaseWLEngine : public BaseEngine PortID idx=InvalidPortID) override; bool handleWLUpdate(PacketPtr pkt); - }; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index cf241c9063..8b5ba20b1c 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -129,8 +129,6 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool handleWLUpdate(PacketPtr pkt); - bool recvWLNotif(Addr addr); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t 
edge_index); bool recvPushUpdate(PacketPtr pkt); }; From e5de463465daacebd538c9b598b38eaec642b85a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:21:51 -0800 Subject: [PATCH 053/279] Pointer issue fixed. --- src/accl/graph/sega/MPU.py | 4 --- src/accl/graph/sega/WLEngine.py | 3 +- src/accl/graph/sega/apply_engine.cc | 14 +++----- src/accl/graph/sega/apply_engine.hh | 7 ++-- src/accl/graph/sega/mpu.cc | 55 ++++------------------------- src/accl/graph/sega/mpu.hh | 10 +----- src/accl/graph/sega/push_engine.cc | 15 +------- src/accl/graph/sega/push_engine.hh | 5 --- src/accl/graph/sega/wl_engine.cc | 14 +++----- src/accl/graph/sega/wl_engine.hh | 7 ++-- 10 files changed, 23 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index efd8dbc11f..71b8841b10 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,12 +38,8 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " - "This MPU") push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " - "This MPU") respPort = ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 12fbcf9b4f..3bfe9fa16f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,4 +34,5 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file + apply_engine = Param.ApplyEngine(Parent.any, + "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 
bc45850041..0f686e7f8c 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,24 +27,20 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/push_engine.hh" namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - mpu(params.mpu) + pushEngine(params.push_engine) {} bool -ApplyEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return push_engine->recvApplyNotif(prop, degree, edgeIndex); -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ - mpu->recvApplyNotif(prop, degree, edgeIndex); - return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index c7d3073e36..4d828c6aa1 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,17 +42,14 @@ namespace gem5 { -class MPU; +class PushEngine; class ApplyEngine : public BaseApplyEngine { private: - - MPU* mpu; + PushEngine* pushEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4824bcd699..23a777d1c6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -33,12 +33,9 @@ namespace gem5 MPU::MPU(const MPUParams ¶ms): ClockedObject(params), - nextRequestorId(0), respPort(name() + ".respPort", this), reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - applyEngine(params.apply_engine), - pushEngine(params.push_engine), wlEngine(params.work_list_engine) {} @@ -59,16 +56,6 @@ MPU::getPort(const std::string &if_name, PortID idx) void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId()) == -1) { - 
applyEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) pushEngine->getRequestorId()) == -1) { - pushEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) wlEngine->getRequestorId()) == -1) { - wlEngine->setRequestorId(nextRequestorId++); - } - //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { @@ -177,9 +164,7 @@ MPU::MPUMemPort::sendPacket(PacketPtr pkt) bool MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) { - //TODO: Investigate sending true all the time - owner->handleMemResp(pkt); - return true; + panic("recvTimingResp called on MPU::MPUMemPort memPort."); } void @@ -224,16 +209,7 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - RequestorID requestorId = pkt->requestorId(); - if (applyEngine->getRequestorId() == requestorId) { - applyEngine->handleMemResp(pkt); - } else if (pushEngine->getRequestorId() == requestorId) { - pushEngine->handleMemResp(pkt); - } else if (wlEngine->getRequestorId() == requestorId) { - wlEngine->handleMemResp(pkt); - } else { - panic("Received a response with an unknown requestorId."); - } + panic("MPU::handleMemResp called!"); } bool @@ -242,39 +218,20 @@ MPU::handleWLUpdate(PacketPtr pkt) return wlEngine->handleWLUpdate(pkt); } -bool -MPU::recvWLNotif(Addr addr) -{ - return applyEngine->recvWLNotif(addr); -} - -bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) -{ - return pushEngine->recvApplyNotif(prop, degree, edge_index); -} - bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (memPort.blocked()) { - return false; - } else { - memPort.sendPacket(pkt); - return true; - } + return handleWLUpdate(pkt); } } - - if (reqPort.blocked()) { - return false; + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; } - reqPort.sendPacket(pkt); return true; - } } diff 
--git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8b5ba20b1c..2df8993749 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -103,18 +103,13 @@ class MPU : public ClockedObject virtual void recvReqRetry(); }; - virtual void startup(); - - RequestorID nextRequestorId; - MPURespPort respPort; MPUReqPort reqPort; MPUMemPort memPort; - ApplyEngine* applyEngine; - PushEngine* pushEngine; WLEngine* wlEngine; + virtual void startup(); AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); @@ -125,9 +120,6 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - bool handleWLUpdate(PacketPtr pkt); bool recvPushUpdate(PacketPtr pkt); }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 922ae32ed2..71cb2955fd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,20 +35,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), mpu(params.mpu) -{ -} - -Port & -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -bool -PushEngine::sendMemReq(PacketPtr pkt) -{ - return mpu->handleMemReq(pkt); -} +{} bool PushEngine::sendPushUpdate(PacketPtr pkt) diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1a800e58f3..7b3474d2ec 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -43,16 +43,11 @@ class PushEngine : public BasePushEngine MPU* mpu; protected: - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 40ec755969..3d9d7af0c6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,25 +27,19 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/apply_engine.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), - mpu(params.mpu) + applyEngine(params.apply_engine) {} -bool -WLEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} - -// FIXME: handle the case where Apply queue is full bool WLEngine::sendWLNotif(Addr addr){ - mpu->recvWLNotif(addr); - return true; + apply_engine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 238ffbe724..c154867b0d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,17 +45,14 @@ namespace gem5 { -// class MPU; +class ApplyEngine; class WLEngine : public BaseWLEngine { private: - - MPU* mpu; + ApplyEngine* applyEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendWLNotif(Addr addr); public: From 0bf910bba645680e0c2edc7aace4e1643f0ad7a6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:24:07 -0800 Subject: [PATCH 054/279] Adding BaseEngine to SConscript. 
--- src/accl/graph/base/SConscript | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..41c48fc419 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,12 @@ Import('*') SimObject('BaseApplyEngine.py') +SimObject('BaseEngine.py') SimObject('BasePushEngine.py') SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') +Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') From 0415b962bf6a72f1bea97b58ceaca90d87f0a8b8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:43:55 -0800 Subject: [PATCH 055/279] Compilation issues fixed. Still linking issues. --- src/accl/graph/base/BaseEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 3 +-- src/accl/graph/base/base_engine.cc | 6 +++--- src/accl/graph/base/base_engine.hh | 14 +++++++------- src/accl/graph/base/base_push_engine.cc | 16 +++++----------- src/accl/graph/base/base_wl_engine.cc | 10 +++++----- src/accl/graph/sega/MPU.py | 8 ++------ src/accl/graph/sega/apply_engine.cc | 3 +-- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.cc | 14 -------------- src/accl/graph/sega/mpu.hh | 2 -- src/accl/graph/sega/push_engine.cc | 1 - src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 3 +-- src/accl/graph/sega/wl_engine.hh | 1 + 15 files changed, 29 insertions(+), 56 deletions(-) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 3eb5f0cbbc..367df8dbc1 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -35,4 +35,5 @@ class BaseEngine(ClockedObject): cxx_header = "accl/graph/base/base_engine.hh" cxx_class = 'gem5::BaseEngine' + system = Param.System(Parent.any, 'System this Engine is a part of') memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc 
b/src/accl/graph/base/base_apply_engine.cc index 842481c2d1..b7f3030e00 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) {} bool diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6a50e1630e..06827c1d4e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -34,8 +34,8 @@ namespace gem5 BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), - requestorId(system->getRequestorId()), memPort(name() + ".memPort", this), + requestorId(system->getRequestorId(this)), nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -77,8 +77,8 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemResponseEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 4f5a29676d..057a4c6d91 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -68,25 +68,25 @@ class BaseEngine : public ClockedObject }; System* system; - const RequestorID requestorId; MemPort memPort; bool handleMemResp(PacketPtr resp); - EventFunctionWrapper nextMemRespEvent; protected: - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - + const RequestorID requestorId; // TODO: Add this later, maybe? 
// int memRespQueueSize; std::queue memRespQueue; - /* Respective function for nextMemRespEvent. - All the classes inheriting from this class will + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + + /* All the classes inheriting from this class will do their main processing in this function. For example, BaseWLEngine reduces the temp_pro with the value of update in this function. */ + EventFunctionWrapper nextMemRespEvent; virtual void processNextMemRespEvent() = 0; public: diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 4c43f95939..187eefe01b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -34,23 +34,17 @@ namespace gem5 { BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - ClockedObject(params), - // vertexQueueSize(params.vertex_queue_size), - // vertexQueueLen(0), - // updateQueue(params.update_queue_size), - // updateQueueLen(0), - nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextSendEvent([this] { processNextSendEvent(); }, name()) -{ -} + BaseEngine(params), + nextReadEvent([this] { processNextReadEvent(); }, name()) +{} bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { notifQueue.emplace(prop, degree, edge_index); - if (!nextReceiveEvent.scheduled()) { - schedule(nextReceiveEvent, nextCycle()); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 5d84e34ccd..20abaa7b20 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,7 +52,7 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); - uint32_t data = *(pkt->getPtr()); + uint32_t value = 
*(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; @@ -66,7 +66,7 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } - if (!queue.empty() && !nextWLReadEvent.scheduled()) { + if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -78,7 +78,7 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(data + request_offset); + WorkListItem wl = memoryToWorkList(respData + request_offset); if (value < wl.temp_prop){ //update prop with temp_prop @@ -87,10 +87,10 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* wlData = workListToMemory(wl); memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, respData, requestorId); + getWritePacket(resp->getAddr(), 64, respData, requestorId); if (!memPortBlocked()) { - if (sendWLNotif(pkt->getAddr() + request_offset)) { + if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); // TODO: Erase map entries, delete wlData; diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 71b8841b10..87de0fb7d6 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,18 +28,14 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject - -# from m5.objects.WLEngine import WLEngine -# from m5.objects.PushEngine import PushEngine -# from m5.objects.ApplyEngine import ApplyEngine +from m5.objects.WLEngine import WLEngine class MPU(ClockedObject): type = 'MPU' cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " - "This MPU") + work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") respPort = 
ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 0f686e7f8c..bc3d703cf6 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" namespace gem5{ @@ -39,7 +38,7 @@ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : bool ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return push_engine->recvApplyNotif(prop, degree, edgeIndex); + return pushEngine->recvApplyNotif(prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 4d828c6aa1..aff2c5417b 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/ApplyEngine.hh" @@ -42,7 +43,6 @@ namespace gem5 { -class PushEngine; class ApplyEngine : public BaseApplyEngine { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 23a777d1c6..9bda696cb5 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -198,20 +198,6 @@ MPU::recvFunctional(PacketPtr pkt) } } -bool -MPU::handleMemReq(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - memPort.sendPacket(pkt); - return true; -} - -void -MPU::handleMemResp(PacketPtr pkt) -{ - panic("MPU::handleMemResp called!"); -} - bool MPU::handleWLUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 2df8993749..a0472eead5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,8 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ 
-#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 71cb2955fd..a1fa86da2b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/mpu.hh" namespace gem5 { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7b3474d2ec..edf698011d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3d9d7af0c6..823aa49bb9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" namespace gem5 { @@ -39,7 +38,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): bool WLEngine::sendWLNotif(Addr addr){ - apply_engine->recvWLNotif(addr); + return applyEngine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c154867b0d..6946713aaa 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" From 74575ea14204edd441e6686afda8479fdc084829 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:19:29 -0800 Subject: [PATCH 056/279] Removing unnecessary includes. 
--- src/accl/graph/base/base_apply_engine.cc | 4 ++-- src/accl/graph/base/base_apply_engine.hh | 5 +---- src/accl/graph/base/base_engine.hh | 4 +--- src/accl/graph/base/base_push_engine.hh | 4 +--- src/accl/graph/base/base_wl_engine.hh | 8 -------- 5 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index b7f3030e00..009c01ccb7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -59,7 +59,7 @@ BaseApplyEngine::processNextApplyCheckEvent() // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; - int req_offset = (addr % 64); + Addr req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; @@ -79,7 +79,7 @@ BaseApplyEngine::processNextMemRespEvent() uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; + Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); // FIXME: Not so much of a fixme. 
However, why do we fwd a worklistitem diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 02646a74ff..e3fe47d923 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -33,11 +33,8 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/port.hh" namespace gem5 { @@ -47,7 +44,7 @@ class BaseApplyEngine : public BaseEngine private: std::queue applyReadQueue; - std::unordered_map requestOffset; + std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 057a4c6d91..b0b05d9477 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -34,10 +34,8 @@ #include "mem/packet.hh" #include "mem/port.hh" -#include "mem/request.hh" #include "params/BaseEngine.hh" #include "sim/clocked_object.hh" -#include "sim/port.hh" #include "sim/system.hh" namespace gem5 @@ -79,7 +77,7 @@ class BaseEngine : public ClockedObject std::queue memRespQueue; bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } /* All the classes inheriting from this class will do their main processing in this function. 
For diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 5a6ef85b0f..0da4241dfd 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -32,9 +32,7 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/port.hh" #include "mem/request.hh" -#include "mem/packet.hh" #include "params/BasePushEngine.hh" namespace gem5 @@ -54,7 +52,7 @@ class BasePushEngine : public BaseEngine }; std::queue notifQueue; - // int vertexQueueSize; + // int notifQueueSize; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index ab8952de41..3ca9a146a1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -34,12 +34,7 @@ #include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { @@ -78,9 +73,6 @@ class BaseWLEngine : public BaseEngine BaseWLEngine(const BaseWLEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool handleWLUpdate(PacketPtr pkt); }; From 91e304792ab6ae533b55d20b3a40d311f4cdd645 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:51:30 -0800 Subject: [PATCH 057/279] Fixing the issue of calling pure virtual function. 
--- src/accl/graph/base/base_apply_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_apply_engine.hh | 6 +++++- src/accl/graph/base/base_engine.cc | 7 ++----- src/accl/graph/base/base_engine.hh | 8 +------- src/accl/graph/base/base_push_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_push_engine.hh | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++-- src/accl/graph/base/base_wl_engine.hh | 2 +- 8 files changed, 50 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 009c01ccb7..e7b7dd6a22 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,7 +37,8 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} bool @@ -73,7 +74,7 @@ BaseApplyEngine::processNextApplyCheckEvent() } void -BaseApplyEngine::processNextMemRespEvent() +BaseApplyEngine::processNextApplyEvent() { PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); @@ -105,8 +106,16 @@ BaseApplyEngine::processNextMemRespEvent() } else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextMemRespEvent, nextCycle()); + if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextApplyEvent, nextCycle()); + } +} + +void +BaseApplyEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index e3fe47d923..486fb687fe 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -49,10 +49,14 @@ class 
BaseApplyEngine : public BaseEngine EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void processNextMemRespEvent(); + + virtual void scheduleMainEvent(); public: PARAMS(BaseApplyEngine); diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 06827c1d4e..245192643c 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,8 +35,7 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)), - nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) + requestorId(system->getRequestorId(this)) {} @@ -77,9 +76,7 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); - } + scheduleMainEvent(); return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index b0b05d9477..3436229aa1 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -79,13 +79,7 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - /* All the classes inheriting from this class will - do their main processing in this function. For - example, BaseWLEngine reduces the temp_pro with - the value of update in this function. 
- */ - EventFunctionWrapper nextMemRespEvent; - virtual void processNextMemRespEvent() = 0; + virtual void scheduleMainEvent() = 0; public: PARAMS(BaseEngine); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 187eefe01b..a963cc9709 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,8 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()) + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} bool @@ -97,7 +98,7 @@ BasePushEngine::processNextReadEvent() } void -BasePushEngine::processNextMemRespEvent() +BasePushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; @@ -124,8 +125,16 @@ BasePushEngine::processNextMemRespEvent() } } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +BasePushEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { + schedule(nextPushEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 0da4241dfd..8bb7d6663a 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -61,9 +61,12 @@ class BasePushEngine : public BaseEngine EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 
20abaa7b20..ef66603de7 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -72,7 +72,7 @@ void BaseWLEngine::processNextWLReadEvent() } void -BaseWLEngine::processNextMemRespEvent() +BaseWLEngine::processNextWLReduceEvent() { PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); @@ -100,9 +100,18 @@ BaseWLEngine::processNextMemRespEvent() else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } +void +BaseWLEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { + schedule(nextWLReduceEvent, nextCycle()); + } +} + + } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3ca9a146a1..a5070f0b26 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: From 22d0fbab1fcba116a2a4cb9f4227b106a8ee36a1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 17:33:02 -0800 Subject: [PATCH 058/279] Fixed cycle in hierarchy and config. Sim starts. 
--- configs/accl/sega.py | 40 ++-- src/accl/graph/base/BaseEngine.py | 2 +- src/accl/graph/base/base_apply_engine.hh | 5 +- src/accl/graph/base/base_engine.cc | 12 ++ src/accl/graph/base/base_engine.hh | 5 +- src/accl/graph/base/base_push_engine.hh | 2 +- src/accl/graph/base/base_wl_engine.hh | 2 +- src/accl/graph/sega/MPU.py | 42 ----- src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 223 ----------------------- src/accl/graph/sega/mpu.hh | 127 ------------- src/accl/graph/sega/push_engine.cc | 49 ++++- src/accl/graph/sega/push_engine.hh | 27 ++- src/accl/graph/sega/wl_engine.cc | 88 +++++++++ src/accl/graph/sega/wl_engine.hh | 34 +++- 18 files changed, 238 insertions(+), 428 deletions(-) delete mode 100644 src/accl/graph/sega/MPU.py delete mode 100644 src/accl/graph/sega/mpu.cc delete mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 288b1211e4..ea158ecdc9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,28 +1,46 @@ import m5 from m5.objects import * -class PyMPU(MPU): - def __init__(self, clk_domain): - super().__init__() - self.clk_domain = clk_domain - self.apply_engine = ApplyEngine() +class MPU(SubSystem): + def __init__(self): + super(MPU, self).__init__() self.push_engine = PushEngine() - self.wl_engine = WLEngine() + self.apply_engine = ApplyEngine(push_engine = self.push_engine) + self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.interconnect = SystemXBar() -class SEGA(System): + self.interconnect.cpu_side_ports = self.wl_engine.mem_port + self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.push_engine.mem_port + + def getRespPort(self): + return self.wl_engine.resp_port + def setRespPort(self, port): + self.wl_engine.resp_port = port + + def getReqPort(self): + return 
self.push_engine.req_port + def setReqPort(self, port): + self.push_engine.req_port = port + def getMemPort(self): + return self.interconnect.mem_side_ports + def setMemPort(self, port): + self.interconnect.mem_side_ports = port + +class SEGA(System): def __init__(self): super(SEGA, self).__init__() - # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = PyMPU(self.clk_domain) + self.mpu = MPU() self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - self.mpu.memPort = self.mem_ctrl.port - self.mpu.reqPort = self.mpu.respPort + self.mpu.setReqPort(self.mpu.getRespPort()) + self.mpu.setMemPort(self.mem_ctrl.port) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 367df8dbc1..16c2f402e5 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -36,4 +36,4 @@ class BaseEngine(ClockedObject): cxx_class = 'gem5::BaseEngine' system = Param.System(Parent.any, 'System this Engine is a part of') - memPort = RequestPort("Port to communicate with the memory") + mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 486fb687fe..9111bd074b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -56,16 +56,13 @@ class BaseApplyEngine : public BaseEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: PARAMS(BaseApplyEngine); BaseApplyEngine(const BaseApplyEngineParams &apply); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool recvWLNotif(Addr addr); }; diff --git 
a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 245192643c..6b40ba4137 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -38,6 +38,18 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : requestorId(system->getRequestorId(this)) {} +BaseEngine::~BaseEngine() +{} + +Port& +BaseEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} void BaseEngine::MemPort::sendPacket(PacketPtr pkt) diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 3436229aa1..53415ddc7c 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/BaseEngine.hh" @@ -78,6 +79,8 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } virtual void scheduleMainEvent() = 0; @@ -85,7 +88,7 @@ class BaseEngine : public ClockedObject PARAMS(BaseEngine); BaseEngine(const BaseEngineParams ¶ms); - + ~BaseEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 8bb7d6663a..01027d2791 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -66,7 +66,7 @@ class BasePushEngine : public BaseEngine protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a5070f0b26..38079f8f94 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py deleted file mode 100644 index 87de0fb7d6..0000000000 --- a/src/accl/graph/sega/MPU.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine - -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") - - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index eb0eed18ab..a743b57262 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,4 +34,4 @@ class PushEngine(BasePushEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this PushEngine") + req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..f20d0e44df 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,11 +28,9 @@ Import('*') SimObject('ApplyEngine.py') -SimObject('MPU.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') -Source('mpu.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py 
b/src/accl/graph/sega/WLEngine.py index 3bfe9fa16f..2d650ecb92 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,5 +34,6 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index aff2c5417b..1190786e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -50,7 +50,8 @@ class ApplyEngine : public BaseApplyEngine PushEngine* pushEngine; protected: - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc deleted file mode 100644 index 9bda696cb5..0000000000 --- a/src/accl/graph/sega/mpu.cc +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/sega/mpu.hh" - -namespace gem5 -{ - -MPU::MPU(const MPUParams ¶ms): - ClockedObject(params), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), - memPort(name() + ".memPort", this), - wlEngine(params.work_list_engine) -{} - -Port& -MPU::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "respPort") { - return respPort; - } else if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -MPU::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. 
- WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - memPort.sendFunctional(pkt); - } -} - -AddrRangeList -MPU::MPURespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -MPU::MPURespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -MPU::MPURespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::MPURespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::MPURespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -MPU::MPUReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::MPUReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -MPU::MPUMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on MPU::MPUMemPort memPort."); -} - -void -MPU::MPUMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -AddrRangeList -MPU::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -MPU::recvFunctional(PacketPtr pkt) -{ - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to implement later. 
- // wlEngine->recvFunctional(pkt); - } else { - memPort.sendFunctional(pkt); - } -} - -bool -MPU::handleWLUpdate(PacketPtr pkt) -{ - return wlEngine->handleWLUpdate(pkt); -} - -bool -MPU::recvPushUpdate(PacketPtr pkt) -{ - Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRanges()) { - if (addr_range.contains(addr)) { - return handleWLUpdate(pkt); - } - } - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return true; -} - -} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh deleted file mode 100644 index a0472eead5..0000000000 --- a/src/accl/graph/sega/mpu.hh +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ -#define __ACCL_GRAPH_SEGA_MPU_HH__ - -#include "accl/graph/sega/wl_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" -#include "params/MPU.hh" -#include "sim/clocked_object.hh" - -namespace gem5 -{ - -class MPU : public ClockedObject -{ - private: - class MPURespPort : public ResponsePort - { - private: - MPU* owner; - - public: - MPURespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class MPUReqPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUReqPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class MPUMemPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUMemPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - 
void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - MPURespPort respPort; - MPUReqPort reqPort; - MPUMemPort memPort; - - WLEngine* wlEngine; - - virtual void startup(); - AddrRangeList getAddrRanges(); - void recvFunctional(PacketPtr pkt); - - public: - PARAMS(MPU); - MPU(const MPUParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - bool handleWLUpdate(PacketPtr pkt); - bool recvPushUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a1fa86da2b..c7b229ad33 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -33,13 +33,58 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - mpu(params.mpu) + reqPort(name() + "reqPort", this) {} +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return mpu->recvPushUpdate(pkt); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index edf698011d..604df4750d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,6 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" -#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 @@ -41,14 +40,36 @@ class MPU; class PushEngine : public BasePushEngine { private: - MPU* mpu; + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; protected: - virtual bool sendPushUpdate(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt) override; public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 823aa49bb9..e565ac119b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ 
b/src/accl/graph/sega/wl_engine.cc @@ -33,12 +33,100 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), + respPort(name() + ".respPort", this), applyEngine(params.apply_engine) {} +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } +} + bool WLEngine::sendWLNotif(Addr addr){ return applyEngine->recvWLNotif(addr); } +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); + //TODO: Might be a good 
idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + sendMemFunctional(pkt); + } +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 6946713aaa..f895a7ad32 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,14 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/WLEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" -#include "sim/system.hh" - namespace gem5 { @@ -51,14 +44,39 @@ class ApplyEngine; class WLEngine : public BaseWLEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; ApplyEngine* applyEngine; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + protected: - virtual bool sendWLNotif(Addr addr); + virtual bool sendWLNotif(Addr addr) override; public: PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } From 9e74f271a5812665ee3b71a5256a0cea8ffd39c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:23:54 -0800 Subject: [PATCH 059/279] Started fixing memory leak. 
--- src/accl/graph/base/base_apply_engine.cc | 6 +++--- src/accl/graph/base/base_push_engine.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e7b7dd6a22..7b643969df 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,9 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; Addr req_offset = (addr % 64); - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; + + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { sendMemReq(memPkt); applyReadQueue.pop(); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index a963cc9709..6e5aa05779 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -112,7 +112,8 @@ BasePushEngine::processNextPushEvent() for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); - uint32_t *update_data = new uint32_t; + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; From a93ce61a65afb17f03161a3f746452a665f7e73a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:35:05 -0800 Subject: [PATCH 060/279] Adding newlines. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/apply_engine.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ea158ecdc9..54970d356e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -49,4 +49,4 @@ def __init__(self): exit_event = m5.simulate() print("Simulation finished!") -exit() \ No newline at end of file +exit() diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index bc3d703cf6..5d5f8daf26 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -42,4 +42,4 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } -} \ No newline at end of file +} From e45304a469e393aeb011452a790188dde3bd15cb Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 24 Feb 2022 11:43:26 -0800 Subject: [PATCH 061/279] Removed the UpdateWL from the MemCmd. --- src/accl/graph/base/util.cc | 3 ++- src/accl/graph/sega/wl_engine.cc | 13 +++++++------ src/mem/packet.hh | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 0baa374714..4172607ed0 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -133,7 +133,8 @@ getUpdatePacket(Addr addr, unsigned int size, // bits req->setPC(((Addr)requestorId) << 2); - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e565ac119b..f3c63e71f3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -120,13 +120,14 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to 
implement later. - // wlEngine->recvFunctional(pkt); - } else { + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { sendMemFunctional(pkt); - } + // } } } diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 69686e7835..e4dab5e551 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -150,7 +150,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - UpdateWL, + // UpdateWL, NUM_MEM_CMDS }; From 6ad40e3190027c4c663c76327efd85459e96ba23 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 25 Feb 2022 11:49:51 -0800 Subject: [PATCH 062/279] Adding initial update. Fixing some bugs. --- src/accl/graph/base/base_wl_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index ef66603de7..1b9d92c1b4 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -62,7 +62,7 @@ void BaseWLEngine::processNextWLReadEvent() requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; - if (memPortBlocked()) { + if (!memPortBlocked()) { sendMemReq(memPkt); updateQueue.pop(); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f3c63e71f3..61bee38c05 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -80,6 +80,15 @@ WLEngine::startup() 16, data, 0); sendMemFunctional(pkt); } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); } bool From 8c2e290de4c36f9f8d78cef91f74c934c23dd716 Mon Sep 17 00:00:00 2001 From: 
Marjan Fariborz Date: Fri, 25 Feb 2022 13:35:24 -0800 Subject: [PATCH 063/279] Adding few debugging flags. --- src/accl/graph/base/SConscript | 2 ++ src/accl/graph/base/base_apply_engine.cc | 7 +++++++ src/accl/graph/base/base_push_engine.cc | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 41c48fc419..c5c8c4e901 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -37,3 +37,5 @@ Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7b643969df..5eb9d90059 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -31,6 +31,8 @@ #include #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -83,6 +85,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" + , __func__, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? 
@@ -101,6 +105,9 @@ BaseApplyEngine::processNextApplyEvent() if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value", + "into WorkList Item: %s\n" + , __func__, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6e5aa05779..f46941b8ed 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_push_engine.hh" #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -47,6 +48,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } + DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); return true; } @@ -114,7 +116,6 @@ BasePushEngine::processNextPushEvent() Edge e = memoryToEdge(curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, @@ -122,6 +123,8 @@ BasePushEngine::processNextPushEvent() requestorId); if (sendPushUpdate(update)) { memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 1b9d92c1b4..38ebf0f35b 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -27,6 +27,7 @@ */ #include "accl/graph/base/base_wl_engine.hh" +#include "debug/MPU.hh" #include @@ -80,6 +81,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" + , __func__, wl.to_string()); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -89,10 +92,13 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 61bee38c05..674004d7a5 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/sega/wl_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { From 5e34439cb721d2ba8cce9d7b42fa79dcaba9ec5f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:04:53 -0800 Subject: [PATCH 064/279] Adding lock_dir. 
--- configs/accl/sega.py | 6 +- src/accl/graph/base/base_apply_engine.cc | 19 ++++--- src/accl/graph/base/base_apply_engine.hh | 3 +- src/accl/graph/base/base_wl_engine.cc | 23 +++++--- src/accl/graph/base/base_wl_engine.hh | 2 + src/accl/graph/sega/ApplyEngine.py | 1 + src/accl/graph/sega/LockDir.py | 46 +++++++++++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.cc | 15 ++++- src/accl/graph/sega/apply_engine.hh | 4 ++ src/accl/graph/sega/lock_dir.cc | 71 ++++++++++++++++++++++++ src/accl/graph/sega/lock_dir.hh | 57 +++++++++++++++++++ src/accl/graph/sega/wl_engine.cc | 15 ++++- src/accl/graph/sega/wl_engine.hh | 5 +- 15 files changed, 248 insertions(+), 22 deletions(-) create mode 100644 src/accl/graph/sega/LockDir.py create mode 100644 src/accl/graph/sega/lock_dir.cc create mode 100644 src/accl/graph/sega/lock_dir.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54970d356e..db0bf4678f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,11 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() + self.lock_dir = LockDirectory() self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine) - self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) + self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) self.interconnect = SystemXBar() + self.interconnect.cpu_side_ports = self.wl_engine.mem_port self.interconnect.cpu_side_ports = self.apply_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 5eb9d90059..890d5dd313 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,14 +61,16 @@ 
BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; + if (!memPortBlocked()) { + sendMemReq(memPkt); + applyReadQueue.pop(); + } } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -113,6 +115,9 @@ BaseApplyEngine::processNextApplyEvent() } else { memRespQueue.pop(); } + if (!releaseAddress(pkt->getAddr())) { + panic("Could not release an address"); + } if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ schedule(nextApplyEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 9111bd074b..f4df298079 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -55,7 +55,8 @@ class BaseApplyEngine : public BaseEngine protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 38ebf0f35b..7f1a27aae5 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,16 +56,18 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - Addr req_addr = 
(addr / 64) * 64; - Addr req_offset = addr % 64; + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); + if (!memPortBlocked()) { + sendMemReq(memPkt); + updateQueue.pop(); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -92,7 +94,6 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); - if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); @@ -106,6 +107,10 @@ BaseWLEngine::processNextWLReduceEvent() else { memRespQueue.pop(); } + if (!releaseAddress(resp->getAddr())) { + panic("Could not release an address"); + } + std::cout << "success" << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 38079f8f94..15371f965b 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,6 +65,8 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 5bb0dc0c25..7a446bb620 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -35,3 +35,4 @@ class ApplyEngine(BaseApplyEngine): cxx_class = 
'gem5::ApplyEngine' push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/LockDir.py new file mode 100644 index 0000000000..d21963dc3a --- /dev/null +++ b/src/accl/graph/sega/LockDir.py @@ -0,0 +1,46 @@ +# Copyright (c) 2012-2014, 2017-2018 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2007 The Regents of The University of Michigan +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class LockDirectory(SimObject): + type = 'LockDirectory' + cxx_header = 'accl/graph/sega/lock_dir.hh' + cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f20d0e44df..e6d2f1fbbc 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,9 +28,11 @@ Import('*') SimObject('ApplyEngine.py') +SimObject('LockDir.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') +Source('lock_dir.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 2d650ecb92..b6e697266e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -37,3 +37,4 @@ class WLEngine(BaseWLEngine): resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc 
index 5d5f8daf26..544bb082ad 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -32,7 +32,8 @@ namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - pushEngine(params.push_engine) + pushEngine(params.push_engine), + lockDir(params.lock_dir) {} bool @@ -42,4 +43,16 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } +bool +ApplyEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +ApplyEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 1190786e36..c88330487a 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -48,10 +49,13 @@ class ApplyEngine : public BaseApplyEngine { private: PushEngine* pushEngine; + LockDirectory* lockDir; protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc new file mode 100644 index 0000000000..b7efa638fe --- /dev/null +++ b/src/accl/graph/sega/lock_dir.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/lock_dir.hh" + +namespace gem5 +{ + +LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : + SimObject(params) +{} + +bool +LockDirectory::acquire(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + lockOwnerMap[addr] = requestorId; + lockDegreeMap[addr] = 1; + return true; + } else if (lockOwnerMap[addr] == requestorId) { + lockDegreeMap[addr] = lockDegreeMap[addr] + 1; + return true; + } else { + return false; + } +} + +bool +LockDirectory::release(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + panic("Should not relase an address before acquiring"); + } else if (lockOwnerMap[addr] != requestorId) { + panic("Should not release and address you don't own"); + } else { + lockDegreeMap[addr] = lockDegreeMap[addr] - 1; + if (lockDegreeMap[addr] == 0) { + lockDegreeMap.erase(addr); + lockOwnerMap.erase(addr); + return true; + } + } + return false; +} + +} diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh new file mode 100644 index 0000000000..64d934d42f --- /dev/null +++ b/src/accl/graph/sega/lock_dir.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ +#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ + +#include + +#include "mem/packet.hh" +#include "params/LockDirectory.hh" +#include "sim/sim_object.hh" + +namespace gem5 +{ + +class LockDirectory: public SimObject +{ + private: + std::unordered_map lockOwnerMap; + std::unordered_map lockDegreeMap; + + public: + PARAMS(LockDirectory); + LockDirectory(const LockDirectoryParams ¶ms); + + bool acquire(Addr addr, RequestorID requestorId); + bool release(Addr addr, RequestorID requestorId); +}; + +} + +#endif diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 674004d7a5..e557a08c18 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -34,7 +34,8 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), respPort(name() + ".respPort", this), - applyEngine(params.apply_engine) + applyEngine(params.apply_engine), + lockDir(params.lock_dir) {} Port& @@ -139,4 +140,16 @@ WLEngine::recvFunctional(PacketPtr pkt) // } } +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr 
addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f895a7ad32..4e8a25795a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "params/WLEngine.hh" namespace gem5 @@ -64,13 +65,15 @@ class WLEngine : public BaseWLEngine RespPort respPort; ApplyEngine* applyEngine; - + LockDirectory* lockDir; virtual void startup(); void recvFunctional(PacketPtr pkt); protected: virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(WLEngine); From fb21094049068b427cfced00ed736b2516cc854a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 15:42:20 -0800 Subject: [PATCH 065/279] Debugging --- src/accl/graph/base/base_wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.cc | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7f1a27aae5..f5d739da2d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -83,8 +83,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" + , __func__, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -110,7 +110,7 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success" << std::endl; + std::cout << "success 
"<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e557a08c18..a84ed2d52f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,11 +54,11 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 + {1000, 1000, 3, 0}, // Addr: 0 + {1000, 1000, 1, 3}, // Addr: 16 + {1000, 1000, 1, 4}, // Addr: 32 + {10000, 1000, 0, 5}, // Addr: 48 + {10000, 10000, 0, 5} // Addr: 64 }; Edge edges [6] = { {0, 16}, // Addr: 1048576 From 6d407598a2c420023261170c2d09df912b06f47d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:53:38 -0800 Subject: [PATCH 066/279] More debugging. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/base_engine.cc | 3 +++ src/accl/graph/sega/lock_dir.cc | 12 ++---------- src/accl/graph/sega/lock_dir.hh | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db0bf4678f..db5a36b987 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -39,7 +39,8 @@ def __init__(self): self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") + # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6b40ba4137..f449e6ffdb 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -87,6 +87,9 @@ BaseEngine::MemPort::recvReqRetry() bool BaseEngine::handleMemResp(PacketPtr pkt) { + if (pkt->isResponse() && pkt->isWrite()) { + return true; + } memRespQueue.push(pkt); scheduleMainEvent(); return true; diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc index b7efa638fe..6a4496175d 100644 --- a/src/accl/graph/sega/lock_dir.cc +++ b/src/accl/graph/sega/lock_dir.cc @@ -40,10 +40,6 @@ LockDirectory::acquire(Addr addr, RequestorID requestorId) { if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { lockOwnerMap[addr] = requestorId; - lockDegreeMap[addr] = 1; - return true; - } else if (lockOwnerMap[addr] == requestorId) { - lockDegreeMap[addr] = lockDegreeMap[addr] + 1; return true; } else { return false; @@ -58,12 +54,8 @@ LockDirectory::release(Addr addr, RequestorID requestorId) } else if (lockOwnerMap[addr] != requestorId) { panic("Should not release and address you don't own"); } else { - lockDegreeMap[addr] = lockDegreeMap[addr] - 1; - if 
(lockDegreeMap[addr] == 0) { - lockDegreeMap.erase(addr); - lockOwnerMap.erase(addr); - return true; - } + lockOwnerMap.erase(addr); + return true; } return false; } diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh index 64d934d42f..012334ce43 100644 --- a/src/accl/graph/sega/lock_dir.hh +++ b/src/accl/graph/sega/lock_dir.hh @@ -42,7 +42,7 @@ class LockDirectory: public SimObject { private: std::unordered_map lockOwnerMap; - std::unordered_map lockDegreeMap; + // std::unordered_map lockDegreeMap; public: PARAMS(LockDirectory); From bda63d5d2e9a33a785bf28bbd9d20415c2ea7a5d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 16:34:09 -0800 Subject: [PATCH 067/279] Fixed the bugs. Simulation is an endless loop. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_apply_engine.cc | 7 +++---- src/accl/graph/base/base_engine.cc | 6 ++++-- src/accl/graph/base/base_push_engine.cc | 2 +- src/accl/graph/base/base_wl_engine.cc | 10 ++++------ 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db5a36b987..163ea169d9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -50,6 +50,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate() +exit_event = m5.simulate(1000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 890d5dd313..e222cb5a76 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,10 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. 
Addr addr = applyReadQueue.front(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index f449e6ffdb..ad87bb3662 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/base/base_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { @@ -36,7 +36,9 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : system(params.system), memPort(name() + ".memPort", this), requestorId(system->getRequestorId(this)) -{} +{ + DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); +} BaseEngine::~BaseEngine() {} diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f46941b8ed..4ebe40e486 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -121,7 +121,7 @@ BasePushEngine::processNextPushEvent() PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - if (sendPushUpdate(update)) { + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" , __func__, e.to_string(), *update_data); diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index f5d739da2d..921e9c683d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,10 +56,9 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; 
- + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; @@ -98,7 +97,7 @@ BaseWLEngine::processNextWLReduceEvent() if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + DPRINTF(MPU, "%s: The WLE is changing to: %s\n" , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } @@ -110,7 +109,6 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success "<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } From 034fa1f5fb8614b4d5078bd9a71399bb0948e4e3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 18:34:18 -0800 Subject: [PATCH 068/279] Debugged: Releases the address when the memory is blocked. Added debugging flgs for validation. 
--- src/accl/graph/base/base_apply_engine.cc | 14 ++++++--- src/accl/graph/base/base_wl_engine.cc | 12 ++++++-- src/accl/graph/sega/wl_engine.cc | 17 ++++++----- src/mem/packet.cc | 39 ++++++++++++++++++++++++ src/mem/packet.hh | 2 ++ 5 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e222cb5a76..39f5dafc67 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -86,8 +86,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? @@ -102,13 +102,17 @@ BaseApplyEngine::processNextApplyEvent() PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); + if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value", - "into WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 921e9c683d..fd45b85077 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -58,6 +58,7 @@ void BaseWLEngine::processNextWLReadEvent() Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; @@ -67,6 +68,9 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } + else{ + releaseAddress(req_addr); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -82,8 +86,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" - , __func__, wl.to_string(), value); + DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" + , __func__, resp->getAddr() + request_offset, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -93,6 +97,10 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a84ed2d52f..03f74f1019 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,18 +54,19 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {1000, 1000, 3, 0}, // Addr: 0 - {1000, 1000, 1, 3}, // Addr: 16 - {1000, 1000, 1, 4}, // Addr: 32 - {10000, 1000, 0, 5}, // Addr: 48 - {10000, 10000, 0, 5} // Addr: 64 + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 }; - Edge edges [6] = { + Edge edges [7] = { {0, 16}, // Addr: 1048576 {0, 32}, // Addr: 1048592 {0, 48}, // Addr: 1048608 {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 + {0, 64}, // Addr: 1048640 + {0, 32} }; for (int i = 0; i < 5; i++) { @@ -75,7 +76,7 @@ WLEngine::startup() sendMemFunctional(pkt); } - for (int i = 0; i < 6; i++) { + for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..da45246e49 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -532,4 +532,43 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } +std::string +Packet::printData() +{ + char ret[1024]; + if (isWrite()) { + uint8_t* data = getPtr(); + std::sprintf(ret,"\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + 
"V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n", + getAddr(), + *((uint32_t*) data), + *((uint32_t*) (data + 4)), + *((uint32_t*) (data + 8)), + *((uint32_t*) (data + 12)), + getAddr() + 16, + *((uint32_t*) (data + 16)), + *((uint32_t*) (data + 20)), + *((uint32_t*) (data + 24)), + *((uint32_t*) (data + 28)), + getAddr() + 32, + *((uint32_t*) (data + 32)), + *((uint32_t*) (data + 36)), + *((uint32_t*) (data + 40)), + *((uint32_t*) (data + 44)), + getAddr() + 48, + *((uint32_t*) (data + 48)), + *((uint32_t*) (data + 52)), + *((uint32_t*) (data + 56)), + *((uint32_t*) (data + 60))); + } + return ret; +} + } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index e4dab5e551..26a7099d53 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1387,6 +1387,8 @@ class Packet : public Printable, public Extensible template void setRaw(T v); + std::string printData(); + public: /** * Check a functional request against a memory value stored in From 99c512c207a1411c52ab703da3b393ec0a4aa91a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Mar 2022 01:24:54 -0700 Subject: [PATCH 069/279] Adding coalescer to the code. 
--- src/accl/graph/base/BaseReadEngine.py | 39 ++++ src/accl/graph/base/BaseReduceEngine.py | 38 ++++ src/accl/graph/base/base_read_engine.cc | 86 ++++++++ src/accl/graph/base/base_read_engine.hh | 101 ++++++++++ src/accl/graph/base/base_reduce_engine.cc | 51 +++++ src/accl/graph/base/base_reduce_engine.hh | 67 +++++++ .../graph/base/{ => old}/BaseApplyEngine.py | 0 src/accl/graph/base/{ => old}/BaseEngine.py | 0 .../graph/base/{ => old}/BasePushEngine.py | 0 src/accl/graph/base/{ => old}/BaseWLEngine.py | 0 .../graph/base/{ => old}/base_apply_engine.cc | 0 .../graph/base/{ => old}/base_apply_engine.hh | 0 src/accl/graph/base/{ => old}/base_engine.cc | 0 src/accl/graph/base/{ => old}/base_engine.hh | 0 .../graph/base/{ => old}/base_push_engine.cc | 0 .../graph/base/{ => old}/base_push_engine.hh | 0 .../graph/base/{ => old}/base_wl_engine.cc | 0 .../graph/base/{ => old}/base_wl_engine.hh | 0 src/accl/graph/sega/coalesce_engine.cc | 187 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.hh | 88 +++++++++ src/accl/graph/sega/{ => old}/ApplyEngine.py | 0 src/accl/graph/sega/{ => old}/LockDir.py | 0 src/accl/graph/sega/{ => old}/PushEngine.py | 0 src/accl/graph/sega/{ => old}/WLEngine.py | 0 src/accl/graph/sega/{ => old}/apply_engine.cc | 0 src/accl/graph/sega/{ => old}/apply_engine.hh | 0 src/accl/graph/sega/{ => old}/lock_dir.cc | 0 src/accl/graph/sega/{ => old}/lock_dir.hh | 0 src/accl/graph/sega/old/push_engine.cc | 90 +++++++++ src/accl/graph/sega/old/push_engine.hh | 77 ++++++++ src/accl/graph/sega/old/wl_engine.cc | 156 +++++++++++++++ src/accl/graph/sega/old/wl_engine.hh | 86 ++++++++ src/accl/graph/sega/push_engine.cc | 144 +++++++++++++- src/accl/graph/sega/push_engine.hh | 32 ++- src/accl/graph/sega/wl_engine.cc | 109 +++++++--- src/accl/graph/sega/wl_engine.hh | 37 ++-- 36 files changed, 1338 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/base/BaseReadEngine.py create mode 100644 src/accl/graph/base/BaseReduceEngine.py create mode 
100644 src/accl/graph/base/base_read_engine.cc create mode 100644 src/accl/graph/base/base_read_engine.hh create mode 100644 src/accl/graph/base/base_reduce_engine.cc create mode 100644 src/accl/graph/base/base_reduce_engine.hh rename src/accl/graph/base/{ => old}/BaseApplyEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseEngine.py (100%) rename src/accl/graph/base/{ => old}/BasePushEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseWLEngine.py (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_push_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_push_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.hh (100%) create mode 100644 src/accl/graph/sega/coalesce_engine.cc create mode 100644 src/accl/graph/sega/coalesce_engine.hh rename src/accl/graph/sega/{ => old}/ApplyEngine.py (100%) rename src/accl/graph/sega/{ => old}/LockDir.py (100%) rename src/accl/graph/sega/{ => old}/PushEngine.py (100%) rename src/accl/graph/sega/{ => old}/WLEngine.py (100%) rename src/accl/graph/sega/{ => old}/apply_engine.cc (100%) rename src/accl/graph/sega/{ => old}/apply_engine.hh (100%) rename src/accl/graph/sega/{ => old}/lock_dir.cc (100%) rename src/accl/graph/sega/{ => old}/lock_dir.hh (100%) create mode 100644 src/accl/graph/sega/old/push_engine.cc create mode 100644 src/accl/graph/sega/old/push_engine.hh create mode 100644 src/accl/graph/sega/old/wl_engine.cc create mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py new file mode 100644 index 0000000000..84c53465b9 --- /dev/null +++ b/src/accl/graph/base/BaseReadEngine.py @@ -0,0 +1,39 @@ +# -*- 
coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReadEngine(ClockedObject): + abstract = True + type = 'BaseReadEngine' + cxx_header = "accl/graph/base/base_read_engine.hh" + cxx_class = 'gem5::BaseReadEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseReduceEngine.py b/src/accl/graph/base/BaseReduceEngine.py new file mode 100644 index 0000000000..0585c36e48 --- /dev/null +++ b/src/accl/graph/base/BaseReduceEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseReduceEngine(ClockedObject): + abstract = True + type = 'BaseReduceEngine' + cxx_header = "accl/graph/base/base_reduce_engine.hh" + cxx_class = 'gem5::BaseReduceEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc new file mode 100644 index 0000000000..4192cdb565 --- /dev/null +++ b/src/accl/graph/base/base_read_engine.cc @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): + ClockedObject(params), + system(params.system), + memPort(name() + ".mem_port", this), + _requestorId(system.getRequestorId(this)), +{} + +BaseReadEngine::~BaseReadEngine() +{} + +Port& +BaseReadEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); +} + +void +BaseReadEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh new file mode 100644 index 0000000000..99f14bcb06 --- /dev/null +++ b/src/accl/graph/base/base_read_engine.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ + +#include +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReadEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseReadEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + MemPort memPort; + + bool handleMemResp(PacketPtr resp); + + protected: + const RequestorID _requestorId; + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + virtual bool handleMemResp(PacketPtr pkt) = 0; + + public: + PARAMS(BaseReadEngine); + + BaseReadEngine(const BaseReadEngineParams ¶ms); + ~BaseReadEngine(); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + RequestorID requestorId() { return _requestorId; } + + 
AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt); + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..fbfc613313 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +void +BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + currentWorkListAddress = addr; + currentWorkList = wl; + scheduleReduceEvent(); +} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..e44f384f26 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ + + +#include "accl/base/util.hh" +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + bool handleIncomingWL(Addr addr, WorkListItem wl); + + protected: + Addr currentWorkListAddress; + WorkListItem currentWorkList; + + const RequestorID _requestorId; + + virtual void scheduleReduceEvent() = 0; + + public: + PARAMS(BaseReduceEngine); + + BaseReduceEngine(const BaseReduceEngineParams ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/BaseApplyEngine.py rename to src/accl/graph/base/old/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py similarity index 100% rename from src/accl/graph/base/BaseEngine.py rename to src/accl/graph/base/old/BaseEngine.py diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py similarity index 100% rename from src/accl/graph/base/BasePushEngine.py rename to src/accl/graph/base/old/BasePushEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py similarity index 100% rename from src/accl/graph/base/BaseWLEngine.py rename to src/accl/graph/base/old/BaseWLEngine.py diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc similarity index 100% rename from src/accl/graph/base/base_apply_engine.cc rename to src/accl/graph/base/old/base_apply_engine.cc diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh similarity index 100% 
rename from src/accl/graph/base/base_apply_engine.hh rename to src/accl/graph/base/old/base_apply_engine.hh diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/old/base_engine.cc similarity index 100% rename from src/accl/graph/base/base_engine.cc rename to src/accl/graph/base/old/base_engine.cc diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/old/base_engine.hh similarity index 100% rename from src/accl/graph/base/base_engine.hh rename to src/accl/graph/base/old/base_engine.hh diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc similarity index 100% rename from src/accl/graph/base/base_push_engine.cc rename to src/accl/graph/base/old/base_push_engine.cc diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh similarity index 100% rename from src/accl/graph/base/base_push_engine.hh rename to src/accl/graph/base/old/base_push_engine.hh diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc similarity index 100% rename from src/accl/graph/base/base_wl_engine.cc rename to src/accl/graph/base/old/base_wl_engine.cc diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh similarity index 100% rename from src/accl/graph/base/base_wl_engine.hh rename to src/accl/graph/base/old/base_wl_engine.hh diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc new file mode 100644 index 0000000000..1f7a94dc7e --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/sega/coalesce_engine.hh" + +#include "accl/sega/wl_engine.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): + BaseReadEngine(params), + reqQueueSize(params.req_queue_size), + conflictAddrQueueSize(params.conflict_addr_queue_size), + nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) +{} + +CoalesceEngine::~CoalesceEngine() +{} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + sendMemFunctional(pkt); +} + +void +CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +{ + peerWLEngine = wl_engine; +} + +bool +CoalesceEngine::recvReadAddr(Addr addr) +{ + assert(reqQueue.size() <= reqQueueSize); + if (reqQueue.size() == reqQueueSize) { + return false; + } + + reqQueue.push(addr); + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextRespondEvent() +{ + // TODO: Investigate this for optimization + Addr addr = reqQueue.front(); + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + if (cacheBlocks[block_index].allocated) { + // Hit + // TODO: I guess this piece of code code could be optimized. + // Not the code per se. The design it represents. 
+ if (cacheBlocks[block_index].addr == alligned_addr) { + if (!cacheBlocks[block_index].taken[wl_offset]) { + if (cacheBlocks[block_index].valid) { + peerWLEngine->handleIncomingWL(addr, + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].taken[wl_offset] = true; + } else { + cacheBlocks[block_index].pending[wl_offset] = true; + } + reqQueue.pop(); + } + } else { // conflict + assert(conflictAddrQueue.size() <= conflictAddrQueueSize); + if (conflictAddrQueue.size() < conflictAddrQueueSize) { + cacheBlocks[block_index].numConflicts += 1; + conflictAddrQueue.push(addr); + reqQueue.pop(); + } + } + } else { + // miss + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].numConflicts = 0; + cacheBlocks[block_index].pending = {false, false, false, false}; + cacheBlocks[block_index].pending[wl_offset] = true; + cacheBlocks[block_index].taken = {false, false, false, false}; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].allocated = true; + + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); + + if (!memPortBlocked()) { + sendMemReq(pkt); + reqQueue.pop(); + } + } + + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } +} + +/* + void recvWLWrite(Addr addr, WorkListItem wl); +*/ + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + if (pkt->isResp() && pkt->isWrite()) { + return true; + } + + Addr addr = pkt->getAddr(); + uint8_t data = pkt->getPtr(); + + int block_index = addr % 256; + cacheBlocks[block_index].valid = true; + + for (i = 0; i < 4; i++) { + cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].taken[i] = false; + if (cacheBlocks[block_index].pending[i]) { + peerWLEngine->handleIncomingWL(addr + (i * 16), + cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].taken[i] = true; + } + cacheBlocks[block_index].pending = false; + } +} + +void 
+CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + assert(cacheBlocks[block_index].taken[wl_offset]); + cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].taken[wl_offset] = false; + + bool taken_item = false; + taken_item &= (cacheBlocks[block_index].taken[0] & + cacheBlocks[block_index].taken[1] & + cacheBlocks[block_index].taken[2] & + cacheBlocks[block_index].taken[3]); + + if (!taken_item) { + for (auto conflictAddr : conflictAddrQueue) { + int conflict_block_index = ((conflictAddr / 64) * 64) % 256; + if (conflict_block_index == block_index) { + // Evict cacheBlocks[block_index] + // Respond to conflictAddr + } + } + } + +} + +} diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..0b349b2c1a --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +class WLEngine; + +class CoalesceEngine : public BaseReadEngine +{ + private: + struct Block + { + WorkListItem items[4]; + Addr addr; + int numConflicts; + bool pending[4]; + bool taken[4]; + bool valid; + bool allocated; + }; + + WLEngine* peerWLEngine; + + Block cacheBlocks[256]; + + int reqQueueSize; + std::queue reqQueue; + + int conflictAddrQueueSize; + std::queue conflictAddrQueue; + + EventFunctionWrapper nextRespondEvent; + void processNextRespondEvent(); + + EventFunctionWrapper nextApplyAndCommitEvent; + void processNextApplyAndCommitEvent(); + + protected: + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(CoalesceEngine); + + CoalesceEngine(const CoalesceEngineParams ¶ms); + ~CoalesceEngine(); + + void recvFunctional(PacketPtr pkt); + + bool recvReadAddr(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + void registerWLEngine(WLEngine* wl_engine); +} + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py 
b/src/accl/graph/sega/old/ApplyEngine.py similarity index 100% rename from src/accl/graph/sega/ApplyEngine.py rename to src/accl/graph/sega/old/ApplyEngine.py diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/old/LockDir.py similarity index 100% rename from src/accl/graph/sega/LockDir.py rename to src/accl/graph/sega/old/LockDir.py diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py similarity index 100% rename from src/accl/graph/sega/PushEngine.py rename to src/accl/graph/sega/old/PushEngine.py diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py similarity index 100% rename from src/accl/graph/sega/WLEngine.py rename to src/accl/graph/sega/old/WLEngine.py diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc similarity index 100% rename from src/accl/graph/sega/apply_engine.cc rename to src/accl/graph/sega/old/apply_engine.cc diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh similarity index 100% rename from src/accl/graph/sega/apply_engine.hh rename to src/accl/graph/sega/old/apply_engine.hh diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc similarity index 100% rename from src/accl/graph/sega/lock_dir.cc rename to src/accl/graph/sega/old/lock_dir.cc diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh similarity index 100% rename from src/accl/graph/sega/lock_dir.hh rename to src/accl/graph/sega/old/lock_dir.hh diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc new file mode 100644 index 0000000000..c7b229ad33 --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngineParams ¶ms) : + BasePushEngine(params), + reqPort(name() + "reqPort", this) +{} + +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + +} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh new file mode 100644 index 0000000000..604df4750d --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" + +namespace gem5 +{ + +class MPU; + +class PushEngine : public BasePushEngine +{ + private: + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + protected: + virtual bool sendPushUpdate(PacketPtr pkt) override; + + public: + PARAMS(PushEngine); + PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc new file mode 100644 index 0000000000..03f74f1019 --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" +#include "debug/MPU.hh" +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params), + respPort(name() + ".respPort", this), + applyEngine(params.apply_engine), + lockDir(params.lock_dir) +{} + +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 + }; + Edge edges [7] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64}, // Addr: 1048640 + {0, 32} + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 7; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); +} + +bool +WLEngine::sendWLNotif(Addr addr){ + return applyEngine->recvWLNotif(addr); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void 
+WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { + sendMemFunctional(pkt); + // } +} + +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + +} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh new file mode 100644 index 0000000000..4e8a25795a --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.hh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" +#include "params/WLEngine.hh" + +namespace gem5 +{ + +class ApplyEngine; + +class WLEngine : public BaseWLEngine +{ + private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; + ApplyEngine* applyEngine; + LockDirectory* lockDir; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + + protected: + virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; + + public: + PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c7b229ad33..c865451999 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,9 +31,16 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) +PushEngine::PushEngine(const PushEngineParams ¶ms): + BaseReadEngine(params), + reqPort(name() + ".req_port", this), + baseEdgeAddr(params.base_edge_addr), + memRespQueueSize(params.mem_resp_queue_size), + pushReqQueueSize(params.push_req_queue_size), + onTheFlyReadReqs(0), + nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} Port& @@ -41,8 +48,10 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; + } else if (if_name == "mem_port") { + return BaseReadEngine::getPort(if_name, idx); } else { - return BasePushEngine::getPort(if_name, idx); + return SimObject::getPort(if_name, idx); } } @@ -78,13 +87,130 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::sendPushUpdate(PacketPtr pkt) +PushEngine::recvWLItem(WorkListItem wl); { - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; + assert(pushReqQueue.size() <= pushReqQueueSize); + if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { + return false; + } + pushReqQueue.push(wl); + + if ((!nextAddrGenEvent.scheduled()) && + (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + return true; +} + +void +PushEngine::processNextAddrGenEvent() +{ + WorkListItem wl = pushReqQueue.front(); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < wl.degree; index++) { + Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == 
req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + }; + + for (int index = 0; index < addr_queue.size(); index++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = wl.prop; + pendingReadReqs.push(pkt); + } + + pushReadReqs.pop(); + + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +PushEngine::processNextReadEvent() +{ + if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && + (!memPortBlocked())) { + PacketPtr pkt = pendingReadReqs.front(); + sendMemReq(pkt); + onTheFlyReadReqs++; + pendingReadReqs.pop(); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + onTheFlyReadReqs--; + memRespQueue.push(pkt); + + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +PushEngine::processNextPushEvent() +{ + PacketPtr pkt = memRespQueue.front(); + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + Edge e = memoryToEdge(curr_edge_data); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t 
[data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { + memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); + // TODO: Erase map entries here. + } + } + + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); } - return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 604df4750d..bf645eb119 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,15 +29,13 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" #include "params/PushEngine.hh" namespace gem5 { -class MPU; - -class PushEngine : public BasePushEngine +class PushEngine : public BaseReadEngine { private: class ReqPort : public RequestPort @@ -62,14 +60,38 @@ class PushEngine : public BasePushEngine ReqPort reqPort; + Addr baseEdgeAddr; + + int pushReqQueueSize; + std::queue pushReqQueue; + + // TODO: Possibility of infinite queueing + std::queue pendingReadReqs; + + int memRespQueueSize; + int onTheFlyReadReqs; + std::queue memRespQueue; + + EventFunctionWrapper nextAddrGenEvent; + void processNextAddrGenEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; + virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool recvWLItem(WorkListItem wl); }; } diff --git 
a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 03f74f1019..f0c522ff6f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,15 +28,22 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} + BaseReduceEngine(params), + respPort(name() + ".resp_port", this), + blockedByCoalescer(false), + coaleseEngine(params.coalesce_engine), + updateQueueSize(params.update_queue_size), + onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()) +{ + coaleseEngine->registerWLEngine(this); +} Port& WLEngine::getPort(const std::string &if_name, PortID idx) @@ -44,7 +51,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "resp_port") { return respPort; } else { - return BaseWLEngine::getPort(if_name, idx); + return BaseReduceEngine::getPort(if_name, idx); } } @@ -53,6 +60,8 @@ WLEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. + //FIXME: The WLEngine no longer has a MemPort. Update this to + // work with the CoalesceEngine instead. 
WorkListItem vertices [5] = { {10000, 10000, 3, 0}, // Addr: 0 {10000, 10000, 1, 3}, // Addr: 16 @@ -93,11 +102,6 @@ WLEngine::startup() handleWLUpdate(first_update); } -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -107,7 +111,7 @@ WLEngine::RespPort::getAddrRanges() const bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleWLUpdate(pkt); + return owner->handleIncomingUpdate(pkt); } Tick @@ -131,26 +135,81 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } + coaleseEngine->recvFunctional(pkt); } -bool -WLEngine::acquireAddress(Addr addr) +AddrRangeList +WLEngine::getAddrRanges() { - return lockDir->acquire(addr, requestorId); + return coaleseEngine->getAddrRanges(); +} + +void +WLEngine::processNextReadEvent() +{ + PacketPtr update = updateQueue.front(); + Addr update_addr = update->getAddr(); + uint32_t update_value = update->getPtr(); + + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { + if (coalesceEngine->recvReadAddr(update_addr)) { + onTheFlyUpdateMap[update_addr] = update_value + updateQueue.pop(); + } + } else { + // TODO: Generalize this to reduce function rather than just min + onTheFlyUpdateMap[update_addr] = + min(update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop(); + // TODO: Add a stat to count the number of coalescions + } + + if ((!nextReadEvent.scheduled()) && + ((!updateQueue.empty()) || + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + schedule(nextReadEvent, nextCycle()); + } +} + +void 
+WLEngine::processNextReduceEvent() +{ + // TODO: Generalize this to reduce function rather than just min + currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); + // TODO: Add a delay here + coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); + + onTheFlyUpdateMap.erase(currentWorkListAddress); + currentWorkListAddress = 0; + currentWorkList = {0, 0, 0, 0}; +} + +void +WLEngine::scheduleReduceEvent() +{ + // TODO: Add checks to see if scheduling is necessary or correct. + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } } bool -WLEngine::releaseAddress(Addr addr) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - return lockDir->release(addr, requestorId); + // TODO: Coalesce updates here too + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } + + updateQueue.push(pkt); + if ((!nextReadEvent.scheduled()) && + (!updateQueue.empty())) { + schedule(nextReadEvent, nextCycle()); + } + return true; } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4e8a25795a..1846825951 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -32,17 +32,14 @@ #include #include -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" +#include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "params/WLEngine.hh" namespace gem5 { -class ApplyEngine; - -class WLEngine : public BaseWLEngine +class WLEngine : public BaseReduceEngine { private: class RespPort : public ResponsePort @@ -64,22 +61,40 @@ class WLEngine : public BaseWLEngine }; RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; + + bool blockedByCoalescer; + CoalesceEngine* coaleseEngine; + + int updateQueueSize; + std::queue 
updateQueue; + + int onTheFlyUpdateMapSize; + std::unordered_map onTheFlyUpdateMap; virtual void startup(); + void recvFunctional(PacketPtr pkt); + AddrRangeList getAddrRanges() const; + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextReduceEvent; + void processNextReduceEvent(); + protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; + virtual void scheduleReduceEvent() = 0; public: PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleIncomingUpdate(PacketPtr pkt); }; } From 2771b720062be50f00447560541b93aa423f486c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 16:01:55 -0700 Subject: [PATCH 070/279] Finalizing source code. Before compile. --- src/accl/graph/base/SConscript | 12 +- src/accl/graph/sega/CoalesceEngine.py | 40 ++++ src/accl/graph/sega/PushEngine.py | 40 ++++ src/accl/graph/sega/SConscript | 8 +- src/accl/graph/sega/WLEngine.py | 40 ++++ src/accl/graph/sega/coalesce_engine.cc | 306 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 30 ++- 7 files changed, 377 insertions(+), 99 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine.py create mode 100644 src/accl/graph/sega/PushEngine.py create mode 100644 src/accl/graph/sega/WLEngine.py diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c5c8c4e901..c6a78eb5e8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,15 +27,11 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BaseEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseReadEngine.py') +SimObject('BaseReduceEngine.py') -Source('base_apply_engine.cc') -Source('base_engine.cc') -Source('base_push_engine.cc') -Source('base_wl_engine.cc') 
+Source('base_read_engine.cc') +Source('base_reduce_engine.cc') Source('util.cc') DebugFlag('MPU') diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..0330da7576 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class CoalesceEngine(BaseReadEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + peer_push_engine = Param.PushEngine(NULL, "") + num_mshr_entry = Param.Int(4, "") + num_tgts_per_mshr = Param.Int(20, "") + outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..9036b4e401 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class PushEngine(BaseReadEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + req_port = RequestPort("Port to send updates to the outside") + base_edge_addr = Param.Addr() + mem_resp_queue_size = Param.Int(0, "") + push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index e6d2f1fbbc..9b4629838b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,12 +27,12 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('LockDir.py') +SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') -Source('apply_engine.cc') -Source('lock_dir.cc') +Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..ec9154b138 --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReduceEngine import BaseReduceEngine + +class WLEngine(BaseReduceEngine): + type = 'WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + resp_port = ResponsePort("Port to Receive updates from outside") + coalesce_engine = Param.CoaleseEngine(NULL, "") + update_queue_size = Param.Int(0, "") + on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1f7a94dc7e..22bc0d49a6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,14 +29,17 @@ #include "accl/sega/coalesce_engine.hh" #include "accl/sega/wl_engine.hh" +#include "debug/MPU.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), - reqQueueSize(params.req_queue_size), - conflictAddrQueueSize(params.conflict_addr_queue_size), + peerPushEngine(params.peer_push_engine), + numMSHREntry(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} @@ -59,69 +62,100 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) bool CoalesceEngine::recvReadAddr(Addr addr) { - assert(reqQueue.size() <= reqQueueSize); - if (reqQueue.size() == reqQueueSize) { - return false; - } - - reqQueue.push(addr); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } - return true; -} - -void -CoalesceEngine::processNextRespondEvent() -{ - // TODO: Investigate this for optimization - Addr addr = reqQueue.front(); + assert(MSHRMap.size() <= numMSHREntry); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int 
wl_offset = (addr - alligned_addr) / 16; - if (cacheBlocks[block_index].allocated) { + if ((cacheBlocks[block_index].addr == alligned_addr) && + (cacheBlocks[block_index].valid)) { // Hit - // TODO: I guess this piece of code code could be optimized. - // Not the code per se. The design it represents. - if (cacheBlocks[block_index].addr == alligned_addr) { - if (!cacheBlocks[block_index].taken[wl_offset]) { - if (cacheBlocks[block_index].valid) { - peerWLEngine->handleIncomingWL(addr, - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].taken[wl_offset] = true; - } else { - cacheBlocks[block_index].pending[wl_offset] = true; - } - reqQueue.pop(); - } - } else { // conflict - assert(conflictAddrQueue.size() <= conflictAddrQueueSize); - if (conflictAddrQueue.size() < conflictAddrQueueSize) { - cacheBlocks[block_index].numConflicts += 1; - conflictAddrQueue.push(addr); - reqQueue.pop(); - } + addrResponseQueue.push(addr); + worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } + return true; } else { // miss - cacheBlocks[block_index].addr = alligned_addr; - cacheBlocks[block_index].numConflicts = 0; - cacheBlocks[block_index].pending = {false, false, false, false}; - cacheBlocks[block_index].pending[wl_offset] = true; - cacheBlocks[block_index].taken = {false, false, false, false}; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].allocated = true; - - PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); - - if (!memPortBlocked()) { - sendMemReq(pkt); - reqQueue.pop(); + if (MSHRMap.find(block_index) == MSHRMap.end()) { + if (MSHRMap.size() == numMSHREntry) { + // Out of MSHR entries + return false; + } else { + if (cacheBlock[block_index].allocated) { + assert(MSHRMap[block_index].size() 
<= numTgtsPerMSHR) + if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + return false; + } + // MSHR available but conflict + cacheBlocks[block_index].hasConflict = true; + MSHRMap[block_index].push_back(addr); + return true; + } else { + // MSHR available and no conflict + assert( + outstandingMemReqQueue.size() <= + outstandingMemReqQueueSize); + if (outstandingMemReqQueue.size() == + outstandingMemReqQueueSize) { + return false; + } + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + + MSHRMap[block_index].push_back(addr); + PacketPtr pkt = getReadPacket(alligned_addr, + 64, _requestorId); + outstandingMemReqQueue.push(pkt); + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + return true; + } + } } + } +} + +void +CoalesceEngine::processNextMemReqEvent() +{ + PacketPtr pkt = outstandingMemReqQueue.front(); + + if (!memPortBlocked()) { + sendMemReq(pkt); + outstandingMemReqQueue.pop(); + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); } +} + +void +CoalesceEngine::processNextRespondEvent() +{ + Addr addr_response = addrResponseQueue.front(); + WorkListItem worklist_response = worklistResponseQueue.front(); + + peerWLEngine->handleIncomingWL(addr_response, worklist_response); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + addrResponseQueue.pop(); + worklistResponseQueue.pop(); + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } @@ -139,19 +173,50 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t data = pkt->getPtr(); - int block_index = addr % 256; + + 
assert((cacheBlocks[block_index].allocated) && // allocated cache block + (!cacheBlocks[block_index].valid) && // valid is false + (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR cacheBlocks[block_index].valid = true; - for (i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); - cacheBlocks[block_index].taken[i] = false; - if (cacheBlocks[block_index].pending[i]) { - peerWLEngine->handleIncomingWL(addr + (i * 16), - cacheBlocks[block_index].items[i]); - cacheBlocks[block_index].taken[i] = true; + } + + int bias = 0; + std::vector servicedIndices; + for (int i = 0; i < MSHRMap[block_index].size(); i++) { + Addr miss_addr = MSHRMap[block_index][i]; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + + if (alligned_miss_addr == addr) { + int wl_offset = (miss_addr - alligned_miss_addr) / 16; + addrResponseQueue.push(miss_addr); + worklistResponseQueue.push( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + servicedIndices.push_back(i); } - cacheBlocks[block_index].pending = false; + } + // TODO: We Can use taken instead of this + for (int i = 0; i < servicedIndices.size(); i++) { + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + servicedIndices[i] - bias); + bias++; + } + + if (MSHRMap[block_index].empty()) { + MSHRMap.erase(block_index); + cacheBlocks[block_index].hasConflict = false; + } else { + cacheBlocks[block_index].hasConflict = true; + } + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } } @@ -162,26 +227,111 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - assert(cacheBlocks[block_index].taken[wl_offset]); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + (1 << wl_offset)); 
cacheBlocks[block_index].item[wl_offset] = wl; - cacheBlocks[block_index].taken[wl_offset] = false; - - bool taken_item = false; - taken_item &= (cacheBlocks[block_index].taken[0] & - cacheBlocks[block_index].taken[1] & - cacheBlocks[block_index].taken[2] & - cacheBlocks[block_index].taken[3]); - - if (!taken_item) { - for (auto conflictAddr : conflictAddrQueue) { - int conflict_block_index = ((conflictAddr / 64) * 64) % 256; - if (conflict_block_index == block_index) { - // Evict cacheBlocks[block_index] - // Respond to conflictAddr - } + cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + + // TODO: Make this more general and programmable. + // && (cacheBlocks[block_index].hasConflict) + if ((cacheBlocks[block_index].takenMask == 0)) { + evictQueue.push(block_index); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } + +} + +void +CoalesceEngine::processNextApplyAndCommitEvent() +{ + int block_index = evictQueue.front(); + uint8_t changedMask = 0; + uint8_t data[64]; + + for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].temp_prop); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); } + uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); + std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); } + if (changed) { + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + PacketPtr write_pkt = getWritePacket( + cacheBlocks[block_index].addr, 64, data, _requestorId); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + Addr miss_addr = MSHRMap[block_index][0]; + // TODO: Make sure this trick works; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt 
= getReadPacket( + alligned_miss_addr, 64, _requestorId); + outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push(read_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + evictQueue.pop(); + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + outstandingMemReqQueue.push(write_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + evictQueue.pop(); + } else { + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + __func__); + } + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index 0b349b2c1a..f5fd85e4cf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/base/base_read_engine.hh" +#include "accl/sega/push_engine.hh" namespace gem5 { @@ -43,22 +44,33 @@ class CoalesceEngine : public BaseReadEngine { WorkListItem items[4]; Addr addr; - int numConflicts; - bool pending[4]; - bool taken[4]; - bool valid; + uint8_t takenMask; bool allocated; + bool valid; + bool hasConflict; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; }; WLEngine* peerWLEngine; - + PushEngine* peerPushEngine; + Block cacheBlocks[256]; - int reqQueueSize; - std::queue reqQueue; + int numMSHREntry; + int numTgtsPerMSHR; + std::unordered_map> MSHRMap; + + int outstandingMemReqQueueSize; + std::queue outstandingMemReqQueue; + + std::queue addrResponseQueue; + std::queue worklistResponseQueue; + + std::queue evictQueue; - int conflictAddrQueueSize; - std::queue conflictAddrQueue; + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From c0beedd26045ba5d6d90f898f90edacd29edd290 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 17:31:55 -0700 Subject: [PATCH 071/279] Compiles. 
--- src/accl/graph/base/SConscript | 2 -- src/accl/graph/base/base_read_engine.cc | 4 +-- src/accl/graph/base/base_read_engine.hh | 11 ++++---- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 9 ++++--- src/accl/graph/base/util.hh | 5 ++++ src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 31 ++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 10 +++++--- src/accl/graph/sega/push_engine.cc | 24 ++++++++++++++---- src/accl/graph/sega/push_engine.hh | 7 +++++ src/accl/graph/sega/wl_engine.cc | 29 +++++++++++---------- src/accl/graph/sega/wl_engine.hh | 4 +-- 14 files changed, 88 insertions(+), 54 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c6a78eb5e8..8aefca2185 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -33,5 +33,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') Source('util.cc') - -DebugFlag('MPU') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 4192cdb565..894831429b 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base/base_read_engine.hh" +#include "accl/graph/base/base_read_engine.hh" namespace gem5 { @@ -35,7 +35,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - _requestorId(system.getRequestorId(this)), + _requestorId(system->getRequestorId(this)) {} BaseReadEngine::~BaseReadEngine() diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 99f14bcb06..956c50e47d 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #include #include @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseEngine.hh" +#include "params/BaseReadEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -53,7 +53,7 @@ class BaseReadEngine : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseEngine* owner): + MemPort(const std::string& name, BaseReadEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -69,8 +69,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - bool handleMemResp(PacketPtr resp); - protected: const RequestorID _requestorId; @@ -85,6 +83,7 @@ class BaseReadEngine : public ClockedObject BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index fbfc613313..82643ba3ff 100644 --- 
a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/base/base_reduce_engine.hh" +#include "accl/graph/base/base_reduce_engine.hh" namespace gem5 { diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index e44f384f26..7851eaf585 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -26,11 +26,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ -#include "accl/base/util.hh" +#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +43,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - bool handleIncomingWL(Addr addr, WorkListItem wl); protected: Addr currentWorkListAddress; @@ -60,6 +59,8 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } + + void handleIncomingWL(Addr addr, WorkListItem wl); }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index a4418a1cb8..1066d37d1c 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -26,6 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ +#define __ACCL_GRAPH_BASE_UTIL_HH__ + #include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -75,3 +78,5 @@ PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data, RequestorID requestorId); } + +#endif // __ACCL_GRAPH_BASE_UTIL_HH__ diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 9036b4e401..129d9454c7 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,6 +35,6 @@ class PushEngine(BaseReadEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr() + base_edge_addr = Param.Addr("") mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index ec9154b138..cab47fbe7b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,6 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoaleseEngine(NULL, "") + coalesce_engine = Param.CoalesceEngine(NULL, "") update_queue_size = Param.Int(0, "") on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 22bc0d49a6..663559cc63 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -26,9 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/sega/coalesce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" -#include "accl/sega/wl_engine.hh" +#include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" namespace gem5 @@ -40,12 +40,13 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -CoalesceEngine::~CoalesceEngine() -{} +// CoalesceEngine::~CoalesceEngine() +// {} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -86,8 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries return false; } else { - if (cacheBlock[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR) + if (cacheBlocks[block_index].allocated) { + assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { return false; } @@ -122,6 +123,10 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } } + } else { + assert(cacheBlocks[block_index].hasConflict); + MSHRMap[block_index].push_back(addr); + return true; } } } @@ -167,12 +172,12 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResp() && pkt->isWrite()) { + if (pkt->isResponse() && pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); - uint8_t data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); int block_index = addr % 256; assert((cacheBlocks[block_index].allocated) && // allocated cache block @@ -218,6 +223,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } + + return 
true; } void @@ -229,7 +236,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); - cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); // TODO: Make this more general and programmable. @@ -261,10 +268,10 @@ CoalesceEngine::processNextApplyAndCommitEvent() changedMask |= (1 << i); } uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); + std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } - if (changed) { + if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f5fd85e4cf..6086a8855e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,8 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/base/base_read_engine.hh" -#include "accl/sega/push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/push_engine.hh" +#include "params/CoalesceEngine.hh" namespace gem5 { @@ -85,7 +87,7 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - ~CoalesceEngine(); + // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); @@ -93,7 +95,7 @@ class CoalesceEngine : public BaseReadEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); -} +}; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c865451999..2a978cfcc5 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/push_engine.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -35,8 +37,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): BaseReadEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - memRespQueueSize(params.mem_resp_queue_size), pushReqQueueSize(params.push_req_queue_size), + memRespQueueSize(params.mem_resp_queue_size), onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextReadEvent([this] { processNextReadEvent(); }, name()), @@ -87,7 +89,7 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::recvWLItem(WorkListItem wl); +PushEngine::recvWLItem(WorkListItem wl) { assert(pushReqQueue.size() <= pushReqQueueSize); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { @@ -133,14 +135,14 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; pendingReadReqs.push(pkt); } - pushReadReqs.pop(); + pushReqQueue.pop(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -176,6 +178,7 @@ PushEngine::handleMemResp(PacketPtr pkt) if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } + return true; } void @@ -199,7 +202,8 @@ PushEngine::processNextPushEvent() *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); + _requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating 
with %d\n" @@ -213,4 +217,14 @@ PushEngine::processNextPushEvent() } } +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index bf645eb119..e97a26c7bd 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" #include "params/PushEngine.hh" namespace gem5 @@ -65,6 +66,10 @@ class PushEngine : public BaseReadEngine int pushReqQueueSize; std::queue pushReqQueue; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + // TODO: Possibility of infinite queueing std::queue pendingReadReqs; @@ -72,6 +77,8 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + bool sendPushUpdate(PacketPtr pkt); + EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f0c522ff6f..43ad112db3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -36,13 +36,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), blockedByCoalescer(false), - coaleseEngine(params.coalesce_engine), + coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()) { - coaleseEngine->registerWLEngine(this); + coalesceEngine->registerWLEngine(this); } Port& @@ -82,14 +82,14 @@ WLEngine::startup() uint8_t* data = workListToMemory(vertices[i]); PacketPtr pkt = getWritePacket(0 + i * 
sizeof(WorkListItem), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } uint8_t* first_update_data = new uint8_t [4]; @@ -97,9 +97,9 @@ WLEngine::startup() *tempPtr = 0; PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); + 0, 4, first_update_data, _requestorId); - handleWLUpdate(first_update); + handleIncomingUpdate(first_update); } AddrRangeList @@ -135,13 +135,13 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - coaleseEngine->recvFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } AddrRangeList -WLEngine::getAddrRanges() +WLEngine::getAddrRanges() const { - return coaleseEngine->getAddrRanges(); + return coalesceEngine->getAddrRanges(); } void @@ -149,18 +149,18 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t update_value = update->getPtr(); + uint32_t* update_value = update->getPtr(); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value + onTheFlyUpdateMap[update_addr] = *update_value; updateQueue.pop(); } } else { // TODO: Generalize this to reduce function rather than just min onTheFlyUpdateMap[update_addr] = - min(update_addr, onTheFlyUpdateMap[update_addr]); + std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); // TODO: Add a stat to count the number of coalescions } @@ -176,8 +176,9 @@ void WLEngine::processNextReduceEvent() { // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], - 
currentWorkList.temp_prop); + currentWorkList.temp_prop = std::min( + onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); // TODO: Add a delay here coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1846825951..3ce01dd69d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -63,7 +63,7 @@ class WLEngine : public BaseReduceEngine RespPort respPort; bool blockedByCoalescer; - CoalesceEngine* coaleseEngine; + CoalesceEngine* coalesceEngine; int updateQueueSize; std::queue updateQueue; @@ -84,7 +84,7 @@ class WLEngine : public BaseReduceEngine void processNextReduceEvent(); protected: - virtual void scheduleReduceEvent() = 0; + virtual void scheduleReduceEvent(); public: PARAMS(WLEngine); From c879b21b76647b105eafe4f1af523e5a578b33b3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:34:29 -0700 Subject: [PATCH 072/279] Debugging after compilation. 
Loop writting to mem --- configs/accl/sega.py | 28 +++++--- src/accl/graph/base/base_reduce_engine.cc | 8 --- src/accl/graph/base/base_reduce_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 83 +++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 60 +++++++++++----- src/accl/graph/sega/wl_engine.hh | 6 +- 8 files changed, 126 insertions(+), 72 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 163ea169d9..f71b0e73e0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,15 +4,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.lock_dir = LockDirectory() - self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) - self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) + self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() - - self.interconnect.cpu_side_ports = self.wl_engine.mem_port - self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): @@ -30,6 +27,16 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + class SEGA(System): def __init__(self): super(SEGA, self).__init__() @@ 
-40,8 +47,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - + # self.mem_ctrl = MemCtrl() + # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) + # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) @@ -50,6 +58,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate(1000000) +exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 82643ba3ff..38a8662ed0 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -40,12 +40,4 @@ BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): BaseReduceEngine::~BaseReduceEngine() {} -void -BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) -{ - currentWorkListAddress = addr; - currentWorkList = wl; - scheduleReduceEvent(); -} - } diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 7851eaf585..64d6e4c8c0 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -50,8 +50,6 @@ class BaseReduceEngine : public ClockedObject const RequestorID _requestorId; - virtual void scheduleReduceEvent() = 0; - public: PARAMS(BaseReduceEngine); @@ -60,7 +58,7 @@ class BaseReduceEngine : public ClockedObject RequestorID requestorId() { return _requestorId; } - void handleIncomingWL(Addr addr, WorkListItem wl); + virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 663559cc63..aa6bc99887 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,8 +45,16 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -// CoalesceEngine::~CoalesceEngine() -// {} +void +CoalesceEngine::startup() +{ + for (int i = 0; i < 256; i++) { + cacheBlocks[i].takenMask = 0; + cacheBlocks[i].allocated = false; + cacheBlocks[i].valid = false; + cacheBlocks[i].hasConflict = false; + } +} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -64,6 +72,8 @@ bool CoalesceEngine::recvReadAddr(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); + DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + __func__, addr); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; @@ -71,11 +81,13 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == alligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" + , __func__, addr); addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -93,18 +105,26 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } // MSHR available but conflict + DPRINTF(MPU, "%s: Read request with addr: %lu missed with " + "conflict. 
Making a request for " + "alligned_addr: %lu.\n", + __func__, addr, alligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { // MSHR available and no conflict assert( - outstandingMemReqQueue.size() <= + outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - if (outstandingMemReqQueue.size() == + if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { return false; } + DPRINTF(MPU, "%s: Read request with addr: " + "%lu missed with no conflict. " + "Making a request for alligned_addr: %lu.\n" + , __func__, addr, alligned_addr); cacheBlocks[block_index].addr = alligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; @@ -112,7 +132,7 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); outstandingMemReqQueue.push(pkt); @@ -124,11 +144,15 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { - assert(cacheBlocks[block_index].hasConflict); + if ((!cacheBlocks[block_index].hasConflict) && + ((addr < cacheBlocks[block_index].addr) || + (addr >= (cacheBlocks[block_index].addr + 64)))) { + cacheBlocks[block_index].hasConflict = true; + } MSHRMap[block_index].push_back(addr); return true; } - } + } } void @@ -143,7 +167,7 @@ CoalesceEngine::processNextMemReqEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } } @@ -152,23 +176,19 @@ CoalesceEngine::processNextRespondEvent() { Addr addr_response = addrResponseQueue.front(); WorkListItem worklist_response = worklistResponseQueue.front(); - + peerWLEngine->handleIncomingWL(addr_response, worklist_response); addrResponseQueue.pop(); worklistResponseQueue.pop(); if ((!nextRespondEvent.scheduled()) && - 
(!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } -/* - void recvWLWrite(Addr addr, WorkListItem wl); -*/ - bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -183,11 +203,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - cacheBlocks[block_index].valid = true; for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); } + cacheBlocks[block_index].valid = true; int bias = 0; std::vector servicedIndices; @@ -201,12 +221,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - servicedIndices.push_back(i); + servicedIndices.push_back(i); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; } @@ -219,7 +239,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -233,12 +253,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + DPRINTF(MPU, "%s: Recieved a WorkList write. 
addr: %lu, wl: %s.\n", + __func__, addr, wl.to_string()); + DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, alligned_addr, + block_index, wl_offset, cacheBlocks[block_index].takenMask); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - + // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { @@ -267,6 +291,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } + DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " + "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, + i, cacheBlocks[block_index].items[i].to_string()); uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } @@ -275,7 +302,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); - + if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; @@ -304,7 +331,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = true; evictQueue.pop(); } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { @@ -325,16 +352,16 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = false; evictQueue.pop(); } 
else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } } - + if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } - + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6086a8855e..6dc7bc1001 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -56,7 +56,7 @@ class CoalesceEngine : public BaseReadEngine WLEngine* peerWLEngine; PushEngine* peerPushEngine; - + Block cacheBlocks[256]; int numMSHREntry; @@ -71,6 +71,8 @@ class CoalesceEngine : public BaseReadEngine std::queue evictQueue; + virtual void startup(); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 2a978cfcc5..06b5381641 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,6 +95,7 @@ PushEngine::recvWLItem(WorkListItem wl) if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } + pushReqQueue.push(wl); if ((!nextAddrGenEvent.scheduled()) && @@ -204,10 +205,10 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, _requestorId); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 43ad112db3..b7f59987cb 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -154,45 +154,70 @@ WLEngine::processNextReadEvent() if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { // TODO: Generalize this to reduce function rather than just min + DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." + "update_addr: %lu, update_value: %u, old_value: %u.\n", + __func__, update_addr, *update_value, + onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } if ((!nextReadEvent.scheduled()) && - ((!updateQueue.empty()) || - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } void -WLEngine::processNextReduceEvent() +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = std::min( - onTheFlyUpdateMap[currentWorkListAddress], - currentWorkList.temp_prop); - // TODO: Add a delay here - coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); - - onTheFlyUpdateMap.erase(currentWorkListAddress); - currentWorkListAddress = 
0; - currentWorkList = {0, 0, 0, 0}; + assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; + // TODO: Add checks to see if scheduling is necessary or correct. + if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + schedule(nextReduceEvent, nextCycle()); + } } void -WLEngine::scheduleReduceEvent() +WLEngine::processNextReduceEvent() { - // TODO: Add checks to see if scheduling is necessary or correct. - if (!nextReduceEvent.scheduled()) { - schedule(nextReduceEvent, nextCycle()); + + std::unordered_map::iterator it = + addrWorkListMap.begin(); + + std::vector servicedAddresses; + while (it != addrWorkListMap.end()) { + Addr addr = it->first; + WorkListItem wl = it->second; + uint32_t update_value = onTheFlyUpdateMap[addr]; + DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " + "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + onTheFlyUpdateMap[addr]); + // TODO: Generalize this to reduce function rather than just min + wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); + servicedAddresses.push_back(addr); + it++; + } + + addrWorkListMap.clear(); + for (int i = 0; i < servicedAddresses.size(); i++) { + onTheFlyUpdateMap.erase(servicedAddresses[i]); } } @@ -206,6 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push(pkt); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3ce01dd69d..1ccb13d91e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,6 +71,7 @@ class WLEngine : public BaseReduceEngine int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; + std::unordered_map addrWorkListMap; virtual void startup(); void recvFunctional(PacketPtr pkt); @@ -83,9 +84,6 @@ 
class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); - protected: - virtual void scheduleReduceEvent(); - public: PARAMS(WLEngine); @@ -95,6 +93,8 @@ class WLEngine : public BaseReduceEngine PortID idx=InvalidPortID) override; bool handleIncomingUpdate(PacketPtr pkt); + + virtual void handleIncomingWL(Addr addr, WorkListItem wl); }; } From 2f106dd1f9a2b2f73fdcd267660926e7b606373f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:51:48 -0700 Subject: [PATCH 073/279] Correctness tested with small graph. --- src/accl/graph/sega/coalesce_engine.cc | 23 ++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index aa6bc99887..62062116c2 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -265,8 +265,19 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) + bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - evictQueue.push(block_index); + for (auto index : evictQueue) { + if (block_index == index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + } + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -329,7 +340,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); @@ -350,11 +363,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else { DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } + } else { + evictQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6dc7bc1001..3290f646f4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -69,7 +69,7 @@ class CoalesceEngine : public BaseReadEngine std::queue addrResponseQueue; std::queue worklistResponseQueue; - std::queue evictQueue; + std::deque evictQueue; virtual void startup(); From ea912f971c15e2b0261a7c9cbd5580b6d484da9d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 23 Mar 2022 09:53:26 -0700 Subject: [PATCH 074/279] Added performance statistics. 
--- src/accl/graph/sega/coalesce_engine.cc | 32 +++++++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 18 +++++++++++++++ src/accl/graph/sega/wl_engine.cc | 22 +++++++++++++++++- src/accl/graph/sega/wl_engine.hh | 15 ++++++++++++ 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62062116c2..d58a36188e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,7 +42,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + stats(*this) {} void @@ -86,6 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; + stats.numVertexReads++; if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { @@ -138,6 +141,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockReads++; schedule(nextMemReqEvent, nextCycle()); } return true; @@ -221,6 +225,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.numVertexReads++; servicedIndices.push_back(i); } } @@ -262,6 +267,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 
<< wl_offset); + stats.numVertexWrites++; // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) @@ -376,6 +382,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockWrites++; schedule(nextMemReqEvent, nextCycle()); } @@ -385,4 +392,27 @@ CoalesceEngine::processNextApplyAndCommitEvent() } } +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + + ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), + "Number of memory blocks writes for vertecies"), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3290f646f4..d45fffa3aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -32,6 +32,7 @@ #include "accl/graph/base/base_read_engine.hh" #include "accl/graph/base/util.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/statistics.hh" #include "params/CoalesceEngine.hh" namespace gem5 @@ -82,6 +83,23 @@ class CoalesceEngine : public BaseReadEngine EventFunctionWrapper nextApplyAndCommitEvent; void processNextApplyAndCommitEvent(); + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + void regStats() override; + + CoalesceEngine &coalesce; + + statistics::Scalar numVertexBlockReads; + 
statistics::Scalar numVertexBlockWrites; + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + }; + + CoalesceStats stats; + protected: virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b7f59987cb..517d10ef67 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextReduceEvent([this]{ processNextReduceEvent(); }, name()) + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + stats(*this) { coalesceEngine->registerWLEngine(this); } @@ -171,6 +172,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); + stats.onTheFlyCoalesce++; updateQueue.pop(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions @@ -209,6 +211,7 @@ WLEngine::processNextReduceEvent() "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + stats.numReduce++; wl.temp_prop = std::min(update_value, wl.temp_prop); coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); @@ -239,4 +242,21 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } +WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) + : statistics::Group(&_wl), + wl(_wl), + + ADD_STAT(numReduce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using 
namespace statistics; +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1ccb13d91e..891916e7af 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 @@ -84,6 +85,20 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + struct WorkListStats : public statistics::Group + { + WorkListStats(WLEngine &worklist); + + void regStats() override; + + WLEngine &wl; + + statistics::Scalar numReduce; + statistics::Scalar onTheFlyCoalesce; + }; + + WorkListStats stats; + public: PARAMS(WLEngine); From a3dd0fe07abde00525fbc5e0dfa916ad7cb8b720 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 14:10:40 -0700 Subject: [PATCH 075/279] Updating definitions for structs and removing unnecessary funcs. 
--- configs/accl/sega.py | 50 +++++--- src/accl/graph/base/base_read_engine.cc | 15 +++ src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/base/base_reduce_engine.hh | 2 - src/accl/graph/base/util.cc | 145 ---------------------- src/accl/graph/base/util.hh | 54 ++++---- src/accl/graph/sega/coalesce_engine.cc | 98 ++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 12 +- src/accl/graph/sega/push_engine.cc | 42 ++++++- src/accl/graph/sega/push_engine.hh | 4 + src/accl/graph/sega/wl_engine.cc | 59 ++------- src/accl/graph/sega/wl_engine.hh | 1 - 12 files changed, 201 insertions(+), 285 deletions(-) delete mode 100644 src/accl/graph/base/util.cc diff --git a/configs/accl/sega.py b/configs/accl/sega.py index f71b0e73e0..8ea247106e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,9 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) - self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) + self.push_engine = PushEngine(base_edge_addr=0x100000, + push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine( + peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, + update_queue_size = 16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port @@ -27,31 +31,41 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port +class MPUMemory(SubSystem): + def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + super(MPUMemory, self).__init__() + self.vertex_mem_ctrl = SimpleMemory( + 
range=vertex_range, bandwidth="25GB/s", + latency="30ns", image_file=vertex_binary) + self.edge_mem_ctrl = SimpleMemory( + range=edge_range, bandwidth="25GB/s", + latency="30ns", image_file=edge_binary) + self.interconnect = SystemXBar() + + self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port + self.interconnect.mem_side_ports = self.edge_mem_ctrl.port - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port + def getPort(self): + return self.interconnect.cpu_side_ports + def setPort(self, port): + self.interconnect.cpu_side_ports = port class SEGA(System): def __init__(self): super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl() - # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) - # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) + self.mem_ctrl = MPUMemory( + vertex_range=AddrRange(start=0x000000, size="2GiB"), + vertex_binary="live-journal/graph_binaries/vertices", + edge_range=AddrRange(start=0x80000000, size="2GiB"), + edge_binary="live-journal/graph_binaries/edgelist_0") + self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.port) + self.mpu.setMemPort(self.mem_ctrl.getPort()) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 894831429b..a32237db35 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -83,4 +83,19 @@ BaseReadEngine::MemPort::recvReqRetry() } } +PacketPtr +BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = 
std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + } diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 956c50e47d..591b51aeb7 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -78,12 +78,14 @@ class BaseReadEngine : public ClockedObject virtual bool handleMemResp(PacketPtr pkt) = 0; + PacketPtr createReadPacket(Addr addr, unsigned int size); + public: PARAMS(BaseReadEngine); BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); - + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 64d6e4c8c0..f2245f571f 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -45,8 +45,6 @@ class BaseReduceEngine : public ClockedObject protected: - Addr currentWorkListAddress; - WorkListItem currentWorkList; const RequestorID _requestorId; diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc deleted file mode 100644 index 4172607ed0..0000000000 --- a/src/accl/graph/base/util.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/util.hh" - -namespace gem5 -{ - -WorkListItem -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - - uint32_t temp_prop = *((uint32_t*) data); - uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - - wl = {temp_prop, prop, degree, addr}; - return wl; -} - -uint8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem) / sizeof(uint8_t); - uint8_t* data = new uint8_t [data_size]; - - uint32_t* tempPtr = (uint32_t*) data; - *tempPtr = wl.temp_prop; - - uint32_t* propPtr = (uint32_t*) (data + 4); - *propPtr = wl.prop; - - uint32_t* degreePtr = (uint32_t*) (data + 8); - *degreePtr = wl.degree; - - uint32_t* edgePtr = (uint32_t*) (data + 12); - *edgePtr = wl.edgeIndex; - - return data; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -Edge -memoryToEdge(uint8_t *data) -{ - uint64_t weight = *((uint64_t*) data); - Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes - Edge e = {weight, neighbor}; - return e; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -uint8_t* -edgeToMemory(Edge e) -{ - int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); - - uint8_t* data = new uint8_t [data_size]; - - uint64_t* weightPtr = (uint64_t*) data; - *weightPtr = e.weight; - - Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes - *neighborPtr = e.neighbor; - - return data; -} - -PacketPtr -getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId) -{ - RequestPtr req = 
std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -PacketPtr -getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -} diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index 1066d37d1c..b51a9f0781 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -30,52 +30,56 @@ #define __ACCL_GRAPH_BASE_UTIL_HH__ #include "base/cprintf.hh" -#include "base/types.hh" -#include "mem/packet.hh" -#include "mem/request.hh" namespace gem5 { -struct WorkListItem +struct __attribute__ ((packed)) WorkListItem { - uint32_t temp_prop; - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t degree : 32; + uint32_t edgeIndex : 32; std::string to_string() { return csprintf( "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", - temp_prop, prop, degree, edgeIndex); + tempProp, prop, degree, edgeIndex); } + WorkListItem(): + tempProp(0), + prop(0), + degree(0), + edgeIndex(0) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index): + tempProp(temp_prop), + prop(prop), + degree(degree), + edgeIndex(edge_index) + {} + }; -struct Edge +struct __attribute__ ((packed)) Edge { - uint64_t weight; - Addr neighbor; + uint16_t weight : 16; + uint64_t neighbor : 48; std::string to_string() { return csprintf("Edge{weight: %lu, neighbor: 
%lu}", weight, neighbor); } -}; -WorkListItem memoryToWorkList(uint8_t* data); -uint8_t* workListToMemory(WorkListItem wl); - -Edge memoryToEdge(uint8_t* data); -uint8_t* edgeToMemory(Edge e); - -PacketPtr getReadPacket(Addr addr, unsigned int size, - RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId); + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d58a36188e..67874cb9b9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -75,29 +75,33 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - if ((cacheBlocks[block_index].addr == alligned_addr) && + if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); + // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; stats.numVertexReads++; - if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + + assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + if 
(!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } return true; } else { // miss if (MSHRMap.find(block_index) == MSHRMap.end()) { + assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries return false; @@ -110,12 +114,14 @@ CoalesceEngine::recvReadAddr(Addr addr) // MSHR available but conflict DPRINTF(MPU, "%s: Read request with addr: %lu missed with " "conflict. Making a request for " - "alligned_addr: %lu.\n", - __func__, addr, alligned_addr); + "aligned_addr: %lu.\n", + __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { + // TODO: Set valid to false every deallocation and + // assert valid == false here. // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= @@ -126,31 +132,34 @@ CoalesceEngine::recvReadAddr(Addr addr) } DPRINTF(MPU, "%s: Read request with addr: " "%lu missed with no conflict. " - "Making a request for alligned_addr: %lu.\n" - , __func__, addr, alligned_addr); - cacheBlocks[block_index].addr = alligned_addr; + "Making a request for aligned_addr: %lu.\n" + , __func__, addr, aligned_addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, - 64, _requestorId); + // TODO: Parameterize 64 to memory atom size + PacketPtr pkt = createReadPacket(aligned_addr, 64); outstandingMemReqQueue.push(pkt); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockReads++; + stats.numVertexBlockReads++; + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); } return true; } } } else { + if (MSHRMap[block_index].size() == 
numTgtsPerMSHR) { + return false; + } if ((!cacheBlocks[block_index].hasConflict) && - ((addr < cacheBlocks[block_index].addr) || - (addr >= (cacheBlocks[block_index].addr + 64)))) { + (aligned_addr != cacheBlocks[block_index].addr)) { cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); @@ -196,20 +205,24 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResponse() && pkt->isWrite()) { + assert(pkt->isResponse()); + if (pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; + int block_index = addr % 256; // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR for (int i = 0; i < 4; i++) { - cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].items[i] = *((WorkListItem*) ( + data + (i * sizeof(WorkListItem)))); } cacheBlocks[block_index].valid = true; @@ -252,16 +265,32 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } +PacketPtr +CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int 
wl_offset = (addr - aligned_addr) / 16; DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, alligned_addr, + DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, aligned_addr, block_index, wl_offset, cacheBlocks[block_index].takenMask); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -298,35 +327,36 @@ CoalesceEngine::processNextApplyAndCommitEvent() { int block_index = evictQueue.front(); uint8_t changedMask = 0; + // TODO: parameterize 64 to memory atom size uint8_t data[64]; for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].temp_prop); + cacheBlocks[block_index].items[i].tempProp); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); + uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + std::memcpy(data + (i * sizeof(WorkListItem)), + wl_data, sizeof(WorkListItem)); } if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - PacketPtr write_pkt = getWritePacket( - cacheBlocks[block_index].addr, 64, data, _requestorId); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, data); if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = getReadPacket( - alligned_miss_addr, 64, _requestorId); + PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); outstandingMemReqQueue.push(write_pkt); outstandingMemReqQueue.push(read_pkt); // TODO: This should be improved diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index d45fffa3aa..4bb21676d4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -35,6 +35,8 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +// TODO: Add parameters for size, memory atom size, type size, +// length of items in the blocks. 
namespace gem5 { @@ -53,6 +55,13 @@ class CoalesceEngine : public BaseReadEngine bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; + Block(): + addr(0), + takenMask(0), + allocated(false), + valid(false), + hasConflict(false) + {} }; WLEngine* peerWLEngine; @@ -74,6 +83,8 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); @@ -107,7 +118,6 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 06b5381641..d09da113ee 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -57,6 +57,19 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + + sendPushUpdate(first_update); +} + + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -136,7 +149,7 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); + PacketPtr pkt = createReadPacket(addr_queue[index], 64); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; @@ -182,6 +195,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } +// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. 
void PushEngine::processNextPushEvent() { @@ -196,17 +210,16 @@ PushEngine::processNextPushEvent() int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); + Edge* e = (Edge*) (curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - _requestorId); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); + , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
@@ -218,6 +231,23 @@ PushEngine::processNextPushEvent() } } +PacketPtr +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e97a26c7bd..81acc9862b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -77,6 +77,10 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + virtual void startup(); + + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + bool sendPushUpdate(PacketPtr pkt); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 517d10ef67..b874ec65ec 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -56,53 +56,6 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - //FIXME: The WLEngine no longer has a MemPort. Update this to - // work with the CoalesceEngine instead. 
- WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, _requestorId); - - handleIncomingUpdate(first_update); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -152,6 +105,7 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t* update_value = update->getPtr(); + // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -178,6 +132,7 @@ WLEngine::processNextReadEvent() // TODO: Add a stat to count the number of coalescions } + // TODO: Only schedule nextReadEvent only when it has to be scheduled if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); @@ -208,11 +163,12 @@ WLEngine::processNextReduceEvent() WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + 
"%d, with new update: %d.\n", __func__, addr, wl.tempProp, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + wl.tempProp = std::min(update_value, wl.tempProp); stats.numReduce++; - wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); it++; @@ -227,16 +183,15 @@ WLEngine::processNextReduceEvent() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - // TODO: Coalesce updates here too assert(updateQueue.size() <= updateQueueSize); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } updateQueue.push(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } return true; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 891916e7af..ef18956ec1 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -73,7 +73,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map onTheFlyUpdateMap; std::unordered_map addrWorkListMap; - virtual void startup(); void recvFunctional(PacketPtr pkt); From 7570393e83e9ed3406a8516e91016476124a063a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 18:03:35 -0700 Subject: [PATCH 076/279] Fixing base_edge_addr in config and debugs. 
--- configs/accl/sega.py | 6 +++--- src/accl/graph/base/SConscript | 1 - src/accl/graph/sega/push_engine.cc | 11 +++++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8ea247106e..680157ba7e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,7 +4,7 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, + self.push_engine = PushEngine(base_edge_addr=0x80000000, push_req_queue_size = 16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) @@ -60,9 +60,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="live-journal/graph_binaries/vertices", + vertex_binary="epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="live-journal/graph_binaries/edgelist_0") + edge_binary="epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8aefca2185..ea96f4323b 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,4 +32,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') -Source('util.cc') diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d09da113ee..c305a4bbb9 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -203,23 +203,26 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); + DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", + __func__, pkt->getAddr()); + Addr offset = reqOffsetMap[req]; int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); Edge* e = (Edge*) (curr_edge_data); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); PacketPtr update = createUpdatePacket(e->neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
From 179340c7b7294baa21822681a682bc4605a2ee9b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 31 Mar 2022 19:00:29 -0700 Subject: [PATCH 077/279] Changing queue to deque --- src/accl/graph/base/base_read_engine.hh | 1 - src/accl/graph/sega/coalesce_engine.cc | 22 +++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 6 +++--- src/accl/graph/sega/push_engine.cc | 12 ++++++------ src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.hh | 2 +- 7 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 591b51aeb7..e21aaa01d2 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#include #include #include "base/addr_range.hh" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 67874cb9b9..9fed1e8230 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,8 +85,8 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push(addr); - worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + addrResponseQueue.push_back(addr); + worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -143,7 +143,7 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); - outstandingMemReqQueue.push(pkt); + outstandingMemReqQueue.push_back(pkt); 
stats.numVertexBlockReads++; @@ -175,7 +175,7 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); - outstandingMemReqQueue.pop(); + outstandingMemReqQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && @@ -192,8 +192,8 @@ CoalesceEngine::processNextRespondEvent() peerWLEngine->handleIncomingWL(addr_response, worklist_response); - addrResponseQueue.pop(); - worklistResponseQueue.pop(); + addrResponseQueue.pop_front(); + worklistResponseQueue.pop_front(); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -234,8 +234,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (alligned_miss_addr == addr) { int wl_offset = (miss_addr - alligned_miss_addr) / 16; - addrResponseQueue.push(miss_addr); - worklistResponseQueue.push( + addrResponseQueue.push_back(miss_addr); + worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; @@ -357,8 +357,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); - outstandingMemReqQueue.push(write_pkt); - outstandingMemReqQueue.push(read_pkt); + outstandingMemReqQueue.push_back(write_pkt); + outstandingMemReqQueue.push_back(read_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -381,7 +381,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { - outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push_back(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); diff --git 
a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4bb21676d4..2cb9856f76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,10 +74,10 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; - std::queue outstandingMemReqQueue; + std::deque outstandingMemReqQueue; - std::queue addrResponseQueue; - std::queue worklistResponseQueue; + std::deque addrResponseQueue; + std::deque worklistResponseQueue; std::deque evictQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c305a4bbb9..450ba9ddc4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,7 +109,7 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push(wl); + pushReqQueue.push_back(wl); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -153,10 +153,10 @@ PushEngine::processNextAddrGenEvent() reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push(pkt); + pendingReadReqs.push_back(pkt); } - pushReqQueue.pop(); + pushReqQueue.pop_front(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -175,7 +175,7 @@ PushEngine::processNextReadEvent() PacketPtr pkt = pendingReadReqs.front(); sendMemReq(pkt); onTheFlyReadReqs++; - pendingReadReqs.pop(); + pendingReadReqs.pop_front(); } if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { @@ -187,7 +187,7 @@ bool PushEngine::handleMemResp(PacketPtr pkt) { onTheFlyReadReqs--; - memRespQueue.push(pkt); + memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); @@ -224,7 +224,7 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); if 
(sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); + memRespQueue.pop_front(); // TODO: Erase map entries here. } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 81acc9862b..1b1a812d16 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,18 +64,18 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::queue pushReqQueue; + std::deque pushReqQueue; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; // TODO: Possibility of infinite queueing - std::queue pendingReadReqs; + std::deque pendingReadReqs; int memRespQueueSize; int onTheFlyReadReqs; - std::queue memRespQueue; + std::deque memRespQueue; virtual void startup(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b874ec65ec..73eacf945f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -115,7 +115,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = *update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { @@ -127,7 +127,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } @@ -188,7 +188,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push(pkt); + updateQueue.push_back(pkt); assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { diff --git 
a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index ef18956ec1..c1ef028f77 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -67,7 +67,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::queue updateQueue; + std::deque updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From d654ddb0dfc0a2b3edf12cbdda084798378fb069 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 20:25:31 -0700 Subject: [PATCH 078/279] Removing old files and renaming utils to data_structs. --- src/accl/graph/base/base_reduce_engine.hh | 5 - .../graph/base/{util.hh => data_structs.hh} | 0 src/accl/graph/base/old/BaseApplyEngine.py | 36 ---- src/accl/graph/base/old/BaseEngine.py | 39 ----- src/accl/graph/base/old/BasePushEngine.py | 36 ---- src/accl/graph/base/old/BaseWLEngine.py | 36 ---- src/accl/graph/base/old/base_apply_engine.cc | 137 --------------- src/accl/graph/base/old/base_apply_engine.hh | 72 -------- src/accl/graph/base/old/base_engine.cc | 100 ----------- src/accl/graph/base/old/base_engine.hh | 98 ----------- src/accl/graph/base/old/base_push_engine.cc | 145 ---------------- src/accl/graph/base/old/base_push_engine.hh | 82 --------- src/accl/graph/base/old/base_wl_engine.cc | 134 --------------- src/accl/graph/base/old/base_wl_engine.hh | 83 ---------- src/accl/graph/sega/coalesce_engine.hh | 2 +- src/accl/graph/sega/old/ApplyEngine.py | 38 ----- src/accl/graph/sega/old/LockDir.py | 46 ------ src/accl/graph/sega/old/PushEngine.py | 37 ----- src/accl/graph/sega/old/WLEngine.py | 40 ----- src/accl/graph/sega/old/apply_engine.cc | 58 ------- src/accl/graph/sega/old/apply_engine.hh | 67 -------- src/accl/graph/sega/old/lock_dir.cc | 63 ------- src/accl/graph/sega/old/lock_dir.hh | 57 ------- src/accl/graph/sega/old/push_engine.cc | 90 ---------- src/accl/graph/sega/old/push_engine.hh | 77 --------- 
src/accl/graph/sega/old/wl_engine.cc | 156 ------------------ src/accl/graph/sega/old/wl_engine.hh | 86 ---------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.hh | 3 +- 29 files changed, 4 insertions(+), 1821 deletions(-) rename src/accl/graph/base/{util.hh => data_structs.hh} (100%) delete mode 100644 src/accl/graph/base/old/BaseApplyEngine.py delete mode 100644 src/accl/graph/base/old/BaseEngine.py delete mode 100644 src/accl/graph/base/old/BasePushEngine.py delete mode 100644 src/accl/graph/base/old/BaseWLEngine.py delete mode 100644 src/accl/graph/base/old/base_apply_engine.cc delete mode 100644 src/accl/graph/base/old/base_apply_engine.hh delete mode 100644 src/accl/graph/base/old/base_engine.cc delete mode 100644 src/accl/graph/base/old/base_engine.hh delete mode 100644 src/accl/graph/base/old/base_push_engine.cc delete mode 100644 src/accl/graph/base/old/base_push_engine.hh delete mode 100644 src/accl/graph/base/old/base_wl_engine.cc delete mode 100644 src/accl/graph/base/old/base_wl_engine.hh delete mode 100644 src/accl/graph/sega/old/ApplyEngine.py delete mode 100644 src/accl/graph/sega/old/LockDir.py delete mode 100644 src/accl/graph/sega/old/PushEngine.py delete mode 100644 src/accl/graph/sega/old/WLEngine.py delete mode 100644 src/accl/graph/sega/old/apply_engine.cc delete mode 100644 src/accl/graph/sega/old/apply_engine.hh delete mode 100644 src/accl/graph/sega/old/lock_dir.cc delete mode 100644 src/accl/graph/sega/old/lock_dir.hh delete mode 100644 src/accl/graph/sega/old/push_engine.cc delete mode 100644 src/accl/graph/sega/old/push_engine.hh delete mode 100644 src/accl/graph/sega/old/wl_engine.cc delete mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index f2245f571f..c8c9784ed1 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -29,8 +29,6 @@ #ifndef 
__ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ - -#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +41,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - protected: const RequestorID _requestorId; @@ -55,8 +52,6 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } - - virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/data_structs.hh similarity index 100% rename from src/accl/graph/base/util.hh rename to src/accl/graph/base/data_structs.hh diff --git a/src/accl/graph/base/old/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py deleted file mode 100644 index 9b240581ac..0000000000 --- a/src/accl/graph/base/old/BaseApplyEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseApplyEngine(BaseEngine): - abstract = True - type = 'BaseApplyEngine' - cxx_header = 'accl/graph/base/base_apply_engine.hh' - cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/old/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py deleted file mode 100644 index 16c2f402e5..0000000000 --- a/src/accl/graph/base/old/BaseEngine.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseEngine(ClockedObject): - abstract = True - type = 'BaseEngine' - cxx_header = "accl/graph/base/base_engine.hh" - cxx_class = 'gem5::BaseEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/old/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py deleted file mode 100644 index 2163864be3..0000000000 --- a/src/accl/graph/base/old/BasePushEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BasePushEngine(BaseEngine): - abstract = True - type = 'BasePushEngine' - cxx_header = "accl/graph/base/base_push_engine.hh" - cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/old/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py deleted file mode 100644 index 7311c396b3..0000000000 --- a/src/accl/graph/base/old/BaseWLEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseWLEngine(BaseEngine): - abstract = True - type = 'BaseWLEngine' - cxx_header = "accl/graph/base/base_wl_engine.hh" - cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/old/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc deleted file mode 100644 index 39f5dafc67..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_apply_engine.hh" - -#include - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - - -namespace gem5 -{ - -BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) -{} - -bool -BaseApplyEngine::recvWLNotif(Addr addr) -{ - // TODO: Investigate the situation where the queue is full. - applyReadQueue.push(addr); - if (!nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } - return true; -} - -void -BaseApplyEngine::processNextApplyCheckEvent() -{ - // TODO: We might want to change the way this function - // pops items off queue, maybe we should pop every n cycles - // or change the clock domain for this simobject. - Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); - } - } - if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } -} - -void -BaseApplyEngine::processNextApplyEvent() -{ - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - RequestPtr request = pkt->req; - Addr request_offset = requestOffset[request]; - - WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem - // to applyengine if temp_prop < prop. If temp_prop has not changed, why - // fwd it to applyengine? - if (wl.temp_prop < wl.prop) { - // TODO: instead of min add a Reduce function. 
- //update prop with temp_prop - wl.prop = wl.temp_prop; - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. " - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - - if (!memPortBlocked()) { - if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - } - } - } else { - memRespQueue.pop(); - } - if (!releaseAddress(pkt->getAddr())) { - panic("Could not release an address"); - } - if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextApplyEvent, nextCycle()); - } -} - -void -BaseApplyEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { - schedule(nextApplyEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh deleted file mode 100644 index f4df298079..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.hh +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BaseApplyEngine.hh" - -namespace gem5 -{ - -class BaseApplyEngine : public BaseEngine -{ - private: - std::queue applyReadQueue; - - std::unordered_map requestOffset; - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - PARAMS(BaseApplyEngine); - - BaseApplyEngine(const BaseApplyEngineParams &apply); - - bool recvWLNotif(Addr addr); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_engine.cc b/src/accl/graph/base/old/base_engine.cc deleted file mode 100644 index ad87bb3662..0000000000 --- a/src/accl/graph/base/old/base_engine.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : - ClockedObject(params), - system(params.system), - memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)) -{ - DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); -} - -BaseEngine::~BaseEngine() -{} - -Port& -BaseEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); - -} - -void -BaseEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -BaseEngine::handleMemResp(PacketPtr pkt) -{ - if (pkt->isResponse() && pkt->isWrite()) { - return true; - } - memRespQueue.push(pkt); - scheduleMainEvent(); - return true; -} - -} diff --git a/src/accl/graph/base/old/base_engine.hh b/src/accl/graph/base/old/base_engine.hh deleted file mode 100644 index 53415ddc7c..0000000000 --- a/src/accl/graph/base/old/base_engine.hh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ - -#include -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - bool handleMemResp(PacketPtr resp); - - protected: - const RequestorID requestorId; - // TODO: Add this later, maybe? 
- // int memRespQueueSize; - std::queue memRespQueue; - - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } - - virtual void scheduleMainEvent() = 0; - - public: - PARAMS(BaseEngine); - - BaseEngine(const BaseEngineParams ¶ms); - ~BaseEngine(); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc deleted file mode 100644 index 4ebe40e486..0000000000 --- a/src/accl/graph/base/old/base_push_engine.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_push_engine.hh" - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - -namespace gem5 -{ - -BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) -{} - -bool -BasePushEngine::recvApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edge_index) -{ - notifQueue.emplace(prop, degree, edge_index); - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); - return true; -} - -void -BasePushEngine::processNextReadEvent() -{ - ApplyNotif notif = notifQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < notif.degree; index++) { - // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 
0; index < addr_queue.size(); index++) { - if (!memPortBlocked()) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; - sendMemReq(pkt); - notifQueue.pop(); - } - } - - if (!nextReadEvent.scheduled() && !notifQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -void -BasePushEngine::processNextPushEvent() -{ - PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); - - Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; - uint32_t value = reqValueMap[req]; - - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); - // TODO: Erase map entries here. 
- } - } - - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextPushEvent, nextCycle()); - } -} - -void -BasePushEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { - schedule(nextPushEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh deleted file mode 100644 index 01027d2791..0000000000 --- a/src/accl/graph/base/old/base_push_engine.hh +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ - -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BasePushEngine.hh" - -namespace gem5 -{ - -class BasePushEngine : public BaseEngine -{ - private: - struct ApplyNotif { - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; - - ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): - prop(prop), degree(degree), edgeIndex(edge_index) - {} - }; - - std::queue notifQueue; - // int notifQueueSize; - - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; - - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - - EventFunctionWrapper nextPushEvent; - void processNextPushEvent(); - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BasePushEngine); - - BasePushEngine(const BasePushEngineParams ¶ms); - - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc deleted file mode 100644 index fd45b85077..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of 
California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_wl_engine.hh" -#include "debug/MPU.hh" - -#include - -namespace gem5 -{ - -BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - BaseEngine(params), - nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) -{} - -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt) -{ - updateQueue.push(pkt); - if(!nextWLReadEvent.scheduled()) { - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} - -void BaseWLEngine::processNextWLReadEvent() -{ - PacketPtr pkt = updateQueue.front(); - uint32_t value = *(pkt->getPtr()); - - Addr addr = pkt->getAddr(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; - - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; - - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); - } - else{ - releaseAddress(req_addr); - } - } - if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { - schedule(nextWLReadEvent, nextCycle()); - } -} - -void -BaseWLEngine::processNextWLReduceEvent() -{ - PacketPtr resp = memRespQueue.front(); - uint8_t* respData = resp->getPtr(); - Addr request_offset = requestOffsetMap[resp->req]; - uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(respData + request_offset); - - DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" - , __func__, resp->getAddr() + request_offset, wl.to_string(), value); - if (value < wl.temp_prop){ - //update prop with temp_prop - wl.temp_prop = value; - - uint8_t* wlData = workListToMemory(wl); - memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(resp->getAddr(), 64, respData, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. 
" - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - if (!memPortBlocked()) { - if (sendWLNotif(resp->getAddr() + request_offset)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is changing to: %s\n" - , __func__, wl.to_string()); - // TODO: Erase map entries, delete wlData; - } - } - } - else { - memRespQueue.pop(); - } - if (!releaseAddress(resp->getAddr())) { - panic("Could not release an address"); - } - if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextWLReduceEvent, nextCycle()); - } -} - -void -BaseWLEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { - schedule(nextWLReduceEvent, nextCycle()); - } -} - - -} diff --git a/src/accl/graph/base/old/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh deleted file mode 100644 index 15371f965b..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.hh +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "accl/graph/base/util.hh" -#include "params/BaseWLEngine.hh" - -namespace gem5 -{ - -class BaseWLEngine : public BaseEngine -{ - private: - std::queue updateQueue; - std::queue responseQueue; - - std::unordered_map requestOffsetMap; - std::unordered_map requestValueMap; - - //Events - EventFunctionWrapper nextWLReadEvent; - void processNextWLReadEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - - EventFunctionWrapper nextWLReduceEvent; - void processNextWLReduceEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - protected: - virtual bool sendWLNotif(Addr addr) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BaseWLEngine); - - BaseWLEngine(const BaseWLEngineParams 
¶ms); - - bool handleWLUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2cb9856f76..ff30efde4c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" diff --git a/src/accl/graph/sega/old/ApplyEngine.py b/src/accl/graph/sega/old/ApplyEngine.py deleted file mode 100644 index 7a446bb620..0000000000 --- a/src/accl/graph/sega/old/ApplyEngine.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseApplyEngine import BaseApplyEngine - -class ApplyEngine(BaseApplyEngine): - type = 'ApplyEngine' - cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::ApplyEngine' - - push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/LockDir.py b/src/accl/graph/sega/old/LockDir.py deleted file mode 100644 index d21963dc3a..0000000000 --- a/src/accl/graph/sega/old/LockDir.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2012-2014, 2017-2018 ARM Limited -# All rights reserved. -# -# The license below extends only to copyright in the software and shall -# not be construed as granting a license to any other intellectual -# property including but not limited to intellectual property relating -# to a hardware implementation of the functionality of the software -# licensed hereunder. You may use the software subject to the license -# terms below provided that you ensure that this notice is replicated -# unmodified and in its entirety in all distributions of the software, -# modified or unmodified, in source code or in binary form. -# -# Copyright (c) 2007 The Regents of The University of Michigan -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.SimObject import SimObject - -class LockDirectory(SimObject): - type = 'LockDirectory' - cxx_header = 'accl/graph/sega/lock_dir.hh' - cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/old/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py deleted file mode 100644 index a743b57262..0000000000 --- a/src/accl/graph/sega/old/PushEngine.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BasePushEngine import BasePushEngine - -class PushEngine(BasePushEngine): - type = 'PushEngine' - cxx_header = "accl/graph/sega/push_engine.hh" - cxx_class = 'gem5::PushEngine' - - req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/old/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py deleted file mode 100644 index b6e697266e..0000000000 --- a/src/accl/graph/sega/old/WLEngine.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseWLEngine import BaseWLEngine - -class WLEngine(BaseWLEngine): - type = 'WLEngine' - cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::WLEngine' - - resp_port = ResponsePort("Port to Receive updates from outside") - apply_engine = Param.ApplyEngine(Parent.any, - "MPU object that owns this WLEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc deleted file mode 100644 index 544bb082ad..0000000000 --- a/src/accl/graph/sega/old/apply_engine.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/apply_engine.hh" - -namespace gem5{ - -ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : - BaseApplyEngine(params), - pushEngine(params.push_engine), - lockDir(params.lock_dir) -{} - -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) -{ - return pushEngine->recvApplyNotif(prop, degree, edgeIndex); - -} - -bool -ApplyEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -ApplyEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh deleted file mode 100644 index c88330487a..0000000000 --- a/src/accl/graph/sega/old/apply_engine.hh +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "accl/graph/sega/push_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/ApplyEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" - -namespace gem5 -{ - - -class ApplyEngine : public BaseApplyEngine -{ - private: - PushEngine* pushEngine; - LockDirectory* lockDir; - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(ApplyEngine); - ApplyEngine(const ApplyEngineParams ¶ms); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc deleted file mode 100644 index 6a4496175d..0000000000 --- a/src/accl/graph/sega/old/lock_dir.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/lock_dir.hh" - -namespace gem5 -{ - -LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : - SimObject(params) -{} - -bool -LockDirectory::acquire(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - lockOwnerMap[addr] = requestorId; - return true; - } else { - return false; - } -} - -bool -LockDirectory::release(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - panic("Should not relase an address before acquiring"); - } else if (lockOwnerMap[addr] != requestorId) { - panic("Should not release and address you don't own"); - } else { - lockOwnerMap.erase(addr); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh deleted file mode 100644 index 012334ce43..0000000000 --- a/src/accl/graph/sega/old/lock_dir.hh +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ -#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ - -#include - -#include "mem/packet.hh" -#include "params/LockDirectory.hh" -#include "sim/sim_object.hh" - -namespace gem5 -{ - -class LockDirectory: public SimObject -{ - private: - std::unordered_map lockOwnerMap; - // std::unordered_map lockDegreeMap; - - public: - PARAMS(LockDirectory); - LockDirectory(const LockDirectoryParams ¶ms); - - bool acquire(Addr addr, RequestorID requestorId); - bool release(Addr addr, RequestorID requestorId); -}; - -} - -#endif diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc deleted file mode 100644 index c7b229ad33..0000000000 --- a/src/accl/graph/sega/old/push_engine.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/push_engine.hh" - -namespace gem5 -{ - -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) -{} - -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BasePushEngine::getPort(if_name, idx); - } -} - -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh deleted file mode 100644 index 604df4750d..0000000000 --- a/src/accl/graph/sega/old/push_engine.hh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ - -#include "accl/graph/base/base_push_engine.hh" -#include "params/PushEngine.hh" - -namespace gem5 -{ - -class MPU; - -class PushEngine : public BasePushEngine -{ - private: - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; - - public: - PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc deleted file mode 100644 index 03f74f1019..0000000000 --- a/src/accl/graph/sega/old/wl_engine.cc +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseWLEngine::getPort(if_name, idx); - } -} - -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - sendMemFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - sendMemFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); - - handleWLUpdate(first_update); -} - -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void 
-WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } -} - -bool -WLEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -WLEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh deleted file mode 100644 index 4e8a25795a..0000000000 --- a/src/accl/graph/sega/old/wl_engine.hh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "params/WLEngine.hh" - -namespace gem5 -{ - -class ApplyEngine; - -class WLEngine : public BaseWLEngine -{ - private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; - - virtual void startup(); - void recvFunctional(PacketPtr pkt); - - protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(WLEngine); - WLEngine(const WLEngineParams ¶ms); - Port& getPort(const 
std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1b1a812d16..4c9822345f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c1ef028f77..a8dff32d44 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -108,7 +109,7 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); - virtual void handleIncomingWL(Addr addr, WorkListItem wl); + void handleIncomingWL(Addr addr, WorkListItem wl); }; } From dde65c8115e8efe993eca11ba93ee7cfac49ef5e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 1 Apr 2022 11:07:05 -0700 Subject: [PATCH 079/279] Fixing bugs. 
--- configs/accl/sega.py | 9 +-- src/accl/graph/sega/push_engine.cc | 110 +++++++++++++++-------------- src/accl/graph/sega/push_engine.hh | 6 +- src/accl/graph/sega/wl_engine.cc | 23 +++--- 4 files changed, 78 insertions(+), 70 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 680157ba7e..a0c7766fe0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,11 +5,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=0x80000000, - push_req_queue_size = 16) + push_req_queue_size=16, + mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size = 16, + update_queue_size=16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() @@ -60,9 +61,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="epinions/graph_binaries/vertices", + vertex_binary="facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="epinions/graph_binaries/edgelist_0") + edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 450ba9ddc4..0b4c981d48 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -65,6 +65,7 @@ PushEngine::startup() *tempPtr = 0; PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -109,7 +110,11 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push_back(wl); + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * 
sizeof(Edge)); + uint32_t update_value = wl.prop; + pushReqQueue.push_back( + std::make_pair(std::make_pair(start_addr, end_addr), update_value)); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -121,43 +126,36 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - WorkListItem wl = pushReqQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < wl.degree; index++) { - Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = createReadPacket(addr_queue[index], 64); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push_back(pkt); + Addr start_addr, end_addr; + uint32_t update_value; + + std::pair, uint32_t> front = pushReqQueue.front(); + std::tie(start_addr, end_addr) = front.first; + update_value = front.second; + + Addr req_addr = (start_addr / 64) * 64; + Addr req_offset = start_addr % 64; + int num_edges = 0; + + if (end_addr > req_addr + 64) { + num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); + } else { + num_edges = (end_addr - start_addr) / sizeof(Edge); } + PacketPtr pkt = createReadPacket(req_addr, 64); + reqOffsetMap[pkt->req] = req_offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = update_value; + pendingReadReqs.push_back(pkt); pushReqQueue.pop_front(); + if (req_addr + 64 < end_addr) { + 
pushReqQueue.push_front( + std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) + ); + } + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); } @@ -207,26 +205,30 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr()); Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); - Edge* e = (Edge*) (curr_edge_data); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop_front(); - // TODO: Erase map entries here. 
- } + Edge* e = (Edge*) (data + offset); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + // uint32_t update_value = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t), (uint8_t*) update_data); + + if (sendPushUpdate(update)) { + reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); + reqNumEdgeMap[req]--; + } + + if (reqNumEdgeMap[req] == 0) { + memRespQueue.pop_front(); + reqOffsetMap.erase(req); + reqNumEdgeMap.erase(req); + reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ -235,7 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -247,6 +250,7 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) pkt->allocate(); pkt->setData(data); + // pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4c9822345f..faee5128b7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,8 +64,9 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque pushReqQueue; + std::deque, uint32_t>> pushReqQueue; + // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; @@ -79,7 
+80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 73eacf945f..117abb61e8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -106,17 +106,18 @@ WLEngine::processNextReadEvent() uint32_t* update_value = update->getPtr(); // FIXME: else logic is wrong - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { - if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. 
" + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); + onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop_front(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + } } } else { // TODO: Generalize this to reduce function rather than just min From 71c6b5dd5b397449207d2a2b8a8a2b6dda2a2668 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 15:39:56 -0700 Subject: [PATCH 080/279] Updating createUpdatePacket. --- src/accl/graph/TODO.md | 8 ++++++++ src/accl/graph/sega/coalesce_engine.cc | 17 ++++------------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 26 +++++++++++++------------- src/accl/graph/sega/push_engine.hh | 4 ++-- src/accl/graph/sega/wl_engine.cc | 14 ++++++++------ 6 files changed, 36 insertions(+), 34 deletions(-) create mode 100644 src/accl/graph/TODO.md diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..d5effbeb96 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* use setLE/setBE inside createUpdatePacket and createWritePacket +* parameterize cache size, associativity, maybe latencies, +and memory atom size in the coalesce engine +* look at all the simobjects and come up with a general architecture. Make +sure all the simobjects follow that architecture. +* implement all the communications between simobjects as req/retry. 
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 9fed1e8230..8d97fffd20 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -300,19 +301,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) - bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - for (auto index : evictQueue) { - if (block_index == index) { - found = true; - break; - } - } - if (!found) { - evictQueue.push_back(block_index); - } - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); + evictQueue.push_back(block_index); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -328,6 +318,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size + uint8_t* wl_data; uint8_t data[64]; for (int i = 0; i < 4; i++) { @@ -341,7 +332,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ff30efde4c..5c4e752cbf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,6 +84,7 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0b4c981d48..870b32f2fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -64,8 +65,8 @@ PushEngine::startup() uint32_t* tempPtr = (uint32_t*) first_update_data; *tempPtr = 0; - PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -193,7 +194,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. +// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPushEvent() { @@ -209,15 +210,14 @@ PushEngine::processNextPushEvent() Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here - *update_data = value + 1; - // uint32_t update_value = value + 1; + uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); + __func__, e->neighbor, update_value); + PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), (uint8_t*) update_data); + sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); @@ -237,8 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -249,8 +249,8 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); - pkt->setData(data); - // pkt->setLE(value); + // pkt->setData(data); + pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index faee5128b7..a539079ede 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -80,8 +80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr 
createUpdatePacket(Addr addr, unsigned int size, uint32_t value); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 117abb61e8..3a6911c1bf 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,9 @@ */ #include "accl/graph/sega/wl_engine.hh" + #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -103,7 +105,7 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t* update_value = update->getPtr(); + uint32_t update_value = update->getLE(); // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { @@ -111,8 +113,8 @@ WLEngine::processNextReadEvent() if (coalesceEngine->recvReadAddr(update_addr)) { DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; + __func__, update_addr, update_value); + onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); @@ -123,10 +125,10 @@ WLEngine::processNextReadEvent() // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." 
"update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, *update_value, + __func__, update_addr, update_value, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = - std::min(*update_value, onTheFlyUpdateMap[update_addr]); + std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); @@ -154,7 +156,6 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = addrWorkListMap.begin(); @@ -190,6 +191,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { From 95cf1b8700c01991754d3be647d5b95f524fe0e3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 17:26:15 -0700 Subject: [PATCH 081/279] Adding retry to wle respPort and debug. --- src/accl/graph/sega/push_engine.cc | 13 +++++++++---- src/accl/graph/sega/wl_engine.cc | 31 +++++++++++++++++++++++++----- src/accl/graph/sega/wl_engine.hh | 3 +++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 870b32f2fb..70d6242f5b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,10 +95,12 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + _blocked = false; sendPacket(blockedPacket); - if (!blocked()) { + if (!_blocked) { blockedPacket = nullptr; } } @@ -202,12 +204,13 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); - DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", - __func__, pkt->getAddr()); - Addr offset = reqOffsetMap[req]; uint32_t value = reqValueMap[req]; + DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + "offset: %lu\n", + __func__, pkt->getAddr(), offset); + Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); @@ -220,6 +223,8 @@ PushEngine::processNextPushEvent() sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { + DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", + __func__, e->neighbor, update_value); reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); reqNumEdgeMap[req]--; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3a6911c1bf..27c7ad4fea 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -64,10 +64,25 @@ WLEngine::RespPort::getAddrRanges() const return owner->getAddrRanges(); } +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + sendRetryReq(); + needSendRetryReq = false; + } +} + bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleIncomingUpdate(pkt); + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; } Tick @@ -107,7 +122,6 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); - // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -118,7 +132,11 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, 
updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } + } } } else { @@ -131,8 +149,10 @@ WLEngine::processNextReadEvent() std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - // TODO: Add a stat to count the number of coalescions + DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -180,6 +200,7 @@ WLEngine::processNextReduceEvent() for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); } + DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index a8dff32d44..476c9be932 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,6 +48,7 @@ class WLEngine : public BaseReduceEngine { private: WLEngine* owner; + bool needSendRetryReq; public: RespPort(const std::string& name, WLEngine* owner): @@ -55,6 +56,8 @@ class WLEngine : public BaseReduceEngine {} virtual AddrRangeList getAddrRanges() const; + void checkRetryReq(); + protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); From af47d0a201b176ee42e208cba3a432f7e40bbf86 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 5 Apr 2022 09:20:52 -0700 Subject: [PATCH 082/279] Debugging coalesce engine deadlock. 
--- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 71 ++++--- 5 files changed, 254 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index b51a9f0781..dacb74e38c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - std::string to_string() - { - return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); - } + // std::string to_string() + // { + // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + // } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8d97fffd20..d7fa806fff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -41,6 +41,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), @@ -77,17 +79,21 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && 
(cacheBlocks[block_index].valid)) { // Hit - DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" - , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); + worklistResponseQueue.push_back( + cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, + worklistResponseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -101,50 +107,72 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } else { // miss + DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries + DPRINTF(MPU, "%s: Out of MSHR entries. " + "Rejecting request.\n", __func__); return false; } else { + DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } - // MSHR available but conflict - DPRINTF(MPU, "%s: Read request with addr: %lu missed with " - "conflict. 
Making a request for " - "aligned_addr: %lu.\n", - __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d]", __func__, addr, block_index); return true; } else { // TODO: Set valid to false every deallocation and - // assert valid == false here. + assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + "allocate a cache line for it.\n", + __func__, addr); if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " + "(outstandingMemReqQueue.size: %u). " + "Rejecting request.\n", __func__, + outstandingMemReqQueue.size()); return false; } - DPRINTF(MPU, "%s: Read request with addr: " - "%lu missed with no conflict. " - "Making a request for aligned_addr: %lu.\n" - , __func__, addr, aligned_addr); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: Allocated cache line[%d] for " + "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, addr, aligned_addr); outstandingMemReqQueue.push_back(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %d", __func__, + outstandingMemReqQueue.size()); stats.numVertexBlockReads++; @@ -156,14 +184,24 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); return true; } } @@ -176,9 +214,24 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); + DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", + __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); outstandingMemReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); } + if ((alarmRequested) && + (outstandingMemReqQueue.size() < + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + schedule(nextApplyAndCommitEvent, nextCycle()); + DPRINTF(MPU, "%s: There is an alarm request for " + "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " + "nextApplyAndCommitEvent.\n", __func__); + } if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { schedule(nextMemReqEvent, nextCycle()); @@ -192,9 +245,14 @@ CoalesceEngine::processNextRespondEvent() WorkListItem worklist_response = worklistResponseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); addrResponseQueue.pop_front(); worklistResponseQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " + "worklistResponseQueue.size = %d.\n", __func__, + worklistResponseQueue.size()); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -208,15 +266,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + "the packet.\n", __func__, pkt->getAddr()); return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. + // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. 
+ int block_index = (addr / 64) % 256; + DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR @@ -224,6 +287,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); + DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -231,29 +296,42 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr alligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = (miss_addr / 64) * 64; - if (alligned_miss_addr == addr) { - int wl_offset = (miss_addr - alligned_miss_addr) / 16; + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / 16; + DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + "be serviced with the received packet.\n", + __func__, miss_addr, block_index); addrResponseQueue.push_back(miss_addr); worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + "worklistResponseQueue. 
worklistResponseQueue.size = %u.\n" + , __func__, block_index, wl_offset, + worklistResponseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); + DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { + Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; + DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + __func__, print_addr); } if (MSHRMap[block_index].empty()) { MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { + // TODO: I think this is unnecessary. cacheBlocks[block_index].hasConflict = true; } @@ -286,27 +364,33 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; - DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", - __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, aligned_addr, - block_index, wl_offset, cacheBlocks[block_index].takenMask); + + DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { + DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." + " It does not have any taken items anymore.\n", + __func__, block_index); evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { + (!evictQueue.empty())&& + ((!alarmRequested) && (spaceRequested == 0))) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -315,90 +399,163 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { + assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size uint8_t* wl_data; uint8_t data[64]; + DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", + __func__, block_index); + DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " + "then commited.\n", __func__, block_index); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else { + alarmRequested = true; + spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " + "an alarm for nextApplyAndCommitEvent when space = %d.\n", + __func__, spaceRequested); + return; + } + for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " - "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, - i, cacheBlocks[block_index].items[i].to_string()); wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } if (changedMask) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, 64, data); - - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + assert( + outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 + ); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); Addr miss_addr = MSHRMap[block_index][0]; - // TODO: Make sure this trick works; - Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); outstandingMemReqQueue.push_back(write_pkt); outstandingMemReqQueue.push_back(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); outstandingMemReqQueue.push_back(write_pkt); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue. 
oustandingMemReqQueue.size = " + "%u.\n", __func__, outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , - __func__); + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } } else { + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary. Deallocated cache line[%d].\n", + __func__, block_index, block_index); evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 5c4e752cbf..902a960301 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,6 +74,8 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; std::deque outstandingMemReqQueue; std::deque addrResponseQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70d6242f5b..c9ed781d79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -212,7 +212,7 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr(), offset); Edge* e = (Edge*) (data + offset); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); // TODO: Implement propagate function here uint32_t update_value = value + 1; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27c7ad4fea..ea45cae652 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -68,7 +68,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -121,43 +121,49 @@ WLEngine::processNextReadEvent() PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " + "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, update_value); onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, + update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } - } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." - "update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, update_value, - onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } } // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } @@ -166,9 +172,14 @@ void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; - // TODO: Add checks to see if scheduling is necessary or correct. - if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", + __func__, addr, wl.to_string()); + + assert(!addrWorkListMap.empty()); + if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } } @@ -182,25 +193,32 @@ WLEngine::processNextReduceEvent() std::vector servicedAddresses; while (it != addrWorkListMap.end()) { Addr addr = it->first; - WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.tempProp, - onTheFlyUpdateMap[addr]); + DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " + "addrWorkListMap[%lu] = %s.\n", __func__, + addr, onTheFlyUpdateMap[addr], + addr, addrWorkListMap[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - wl.tempProp = std::min(update_value, wl.tempProp); + addrWorkListMap[addr].tempProp = + std::min(update_value, addrWorkListMap[addr].tempProp); + DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, wl); + coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); servicedAddresses.push_back(addr); + DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", + __func__, addr); it++; } addrWorkListMap.clear(); for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, servicedAddresses[i]); } - DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool @@ -212,9 +230,10 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); - + DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); assert(!updateQueue.empty()); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } From 98b03f4af6a9ec15c14291478ea5d1a2948655ec Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 7 Apr 2022 15:06:58 -0700 Subject: [PATCH 083/279] Restructing inheritance and fixiing inf queue. 
--- configs/accl/sega.py | 6 +- src/accl/graph/TODO.md | 1 + src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 83 ++++++++ src/accl/graph/base/base_read_engine.hh | 18 +- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/CoalesceEngine.py | 3 +- src/accl/graph/sega/PushEngine.py | 1 - src/accl/graph/sega/coalesce_engine.cc | 254 ++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 16 +- src/accl/graph/sega/push_engine.cc | 142 ++++++------- src/accl/graph/sega/push_engine.hh | 55 +++-- src/accl/graph/sega/wl_engine.cc | 10 +- src/accl/graph/sega/wl_engine.hh | 2 +- 14 files changed, 348 insertions(+), 254 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0c7766fe0..8e24280366 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,9 +2,9 @@ from m5.objects import * class MPU(SubSystem): - def __init__(self): + def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x80000000, + self.push_engine = PushEngine(base_edge_addr=base_edge_addr, push_req_queue_size=16, mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( @@ -58,7 +58,7 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = MPU() + self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), vertex_binary="facebook/graph_binaries/vertices", diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index d5effbeb96..a0e2cefeff 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -6,3 +6,4 @@ and memory atom size in the coalesce engine * look at all the simobjects and come up with a general architecture. Make sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. 
+* get rid of maps with RequestPtr as keys diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 84c53465b9..3ddab2d3c4 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -37,3 +37,6 @@ class BaseReadEngine(ClockedObject): system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") + + outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " + "which memory requests are queued.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index a32237db35..e3b588cfc6 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/base/base_read_engine.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -35,6 +36,10 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)) {} @@ -83,6 +88,31 @@ BaseReadEngine::MemPort::recvReqRetry() } } +void +BaseReadEngine::processNextMemReqEvent() +{ + if (memPort.blocked()) { + return; + } + + // TODO: Maybe add a DPRINTF here. 
+ PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + outstandingMemReqQueue.pop_front(); + + if (alarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + respondToAlarm(); + } + + if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + PacketPtr BaseReadEngine::createReadPacket(Addr addr, unsigned int size) { @@ -98,4 +128,57 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) return pkt; } +PacketPtr +BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +bool +BaseReadEngine::memReqQueueHasSpace(int space) +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return ( + outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + ); +} + +bool +BaseReadEngine::memReqQueueFull() +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); +} + +void +BaseReadEngine::enqueueMemReq(PacketPtr pkt) +{ + panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + outstandingMemReqQueue.push_back(pkt); + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { + schedule(nextMemReqEvent, nextCycle()); + } +} + +void +BaseReadEngine::requestAlarm(int space) { + panic_if((alarmRequested == true) || (spaceRequested != 0), + "You should not request another alarm without the first one being" + "responded to.\n"); + alarmRequested = true; + spaceRequested = space; +} + } diff --git 
a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index e21aaa01d2..bec922beef 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,16 +68,30 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; + std::deque outstandingMemReqQueue; + + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); + protected: const RequestorID _requestorId; - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + bool memReqQueueHasSpace(int space); + bool memReqQueueFull(); + void enqueueMemReq(PacketPtr pkt); + bool pendingAlarm() { return alarmRequested; } + void requestAlarm(int space); + + virtual void respondToAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: PARAMS(BaseReadEngine); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index dacb74e38c..28a503528f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - // std::string to_string() - // { - // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); - // } + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 0330da7576..bec7e3d233 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -33,8 +33,7 @@ class 
CoalesceEngine(BaseReadEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - + peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 129d9454c7..645bc5f4ea 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -36,5 +36,4 @@ class PushEngine(BaseReadEngine): req_port = RequestPort("Port to send updates to the outside") base_edge_addr = Param.Addr("") - mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7fa806fff..015629245b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -40,10 +40,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) @@ -85,14 +81,11 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. 
Pushed cacheBlocks[%d][%d]: %s " - "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - worklistResponseQueue.size(), + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -100,7 +93,7 @@ CoalesceEngine::recvReadAddr(Addr addr) stats.readHits++; stats.numVertexReads++; - assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -136,21 +129,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); return true; } else { - // TODO: Set valid to false every deallocation and assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - assert( - outstandingMemReqQueue.size() <= - outstandingMemReqQueueSize); + //TODO: Fix this to work with new inheritance. + // assert( + // outstandingMemReqQueue.size() <= + // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (outstandingMemReqQueue.size() == - outstandingMemReqQueueSize) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " - "(outstandingMemReqQueue.size: %u). " - "Rejecting request.\n", __func__, - outstandingMemReqQueue.size()); + if (memReqQueueFull()) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + "Rejecting request.\n", __func__); return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -169,17 +159,10 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = 64.\n", __func__, addr, aligned_addr); - outstandingMemReqQueue.push_back(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %d", __func__, - outstandingMemReqQueue.size()); - + enqueueMemReq(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + __func__); stats.numVertexBlockReads++; - - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { - schedule(nextMemReqEvent, nextCycle()); - } return true; } } @@ -207,65 +190,41 @@ CoalesceEngine::recvReadAddr(Addr addr) } } -void -CoalesceEngine::processNextMemReqEvent() -{ - PacketPtr pkt = outstandingMemReqQueue.front(); - - if (!memPortBlocked()) { - sendMemReq(pkt); - DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", - __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); - outstandingMemReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); - } - - if ((alarmRequested) && - (outstandingMemReqQueue.size() < - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - schedule(nextApplyAndCommitEvent, nextCycle()); - DPRINTF(MPU, "%s: There is an alarm request for " - "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " - "nextApplyAndCommitEvent.\n", __func__); - } - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - void CoalesceEngine::processNextRespondEvent() { - Addr addr_response = addrResponseQueue.front(); - WorkListItem worklist_response = worklistResponseQueue.front(); + Addr addr_response; + WorkListItem worklist_response; + std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); - addrResponseQueue.pop_front(); - worklistResponseQueue.pop_front(); + responseQueue.pop_front(); DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " "worklistResponseQueue.size = %d.\n", __func__, - worklistResponseQueue.size()); + responseQueue.size()); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } +void +CoalesceEngine::respondToAlarm() +{ + assert(!nextApplyAndCommitEvent.scheduled()); + schedule(nextApplyAndCommitEvent, nextCycle()); +} + bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + delete pkt; DPRINTF(MPU, "%s: Received a write response for Addr: %lu. 
Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; @@ -291,6 +250,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + delete pkt; int bias = 0; std::vector servicedIndices; @@ -303,13 +263,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); - addrResponseQueue.push_back(miss_addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " - "worklistResponseQueue. worklistResponseQueue.size = %u.\n" + "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, - worklistResponseQueue.size()); + responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); @@ -336,8 +295,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -363,7 +321,8 @@ CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr aligned_addr = (addr / 64) * 64; + // TODO: Parameterize all the numbers here. 
+ Addr aligned_addr = std::floor(addr / 64) * 64; int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; @@ -371,6 +330,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); + + if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + cacheBlocks[block_index].hasChange = true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; @@ -378,7 +342,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - // && (cacheBlocks[block_index].hasConflict) + // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add + // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -389,8 +354,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())&& - ((!alarmRequested) && (spaceRequested == 0))) { + (!evictQueue.empty()) && + (pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -399,36 +364,45 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { - assert((!alarmRequested) && (spaceRequested == 0)); + // FIXME: Refactor the line below to work with the new inheritance. 
+ // assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; - // TODO: parameterize 64 to memory atom size - uint8_t* wl_data; - uint8_t data[64]; DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); - - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); } else { - alarmRequested = true; - spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " - "an alarm for nextApplyAndCommitEvent when space = %d.\n", - __func__, spaceRequested); + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); return; } + // Reducing between tempProp and prop for each item in the cache line. for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( @@ -442,23 +416,18 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } - wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); - std::memcpy(data + (i * sizeof(WorkListItem)), - wl_data, sizeof(WorkListItem)); } - if (changedMask) { + if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + // TODO: Parameterize this 64 to memory atom size PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, data); + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", __func__, write_pkt->getAddr()); if (cacheBlocks[block_index].hasConflict) { - assert( - outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 - ); DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write " "back packet and its subsequent read packet.\n", @@ -467,18 +436,19 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); // TODO: parameterize 64 - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = 64.\n", __func__, miss_addr, aligned_miss_addr); - outstandingMemReqQueue.push_back(write_pkt); - outstandingMemReqQueue.push_back(read_pkt); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); + " to outstandingMemReqQueue.\n" , __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -500,22 +470,25 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } else { - assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); - outstandingMemReqQueue.push_back(write_pkt); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue. oustandingMemReqQueue.size = " - "%u.\n", __func__, outstandingMemReqQueue.size()); + "outstandingMemReqQueue.\n", __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -537,33 +510,58 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + + // Since allocated is false, does not matter what the address is. cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } } else { - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary. Deallocated cache line[%d].\n", + "backs are necessary.\n", __func__, block_index, block_index); - evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockWrites++; - schedule(nextMemReqEvent, nextCycle()); + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + } } + evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 902a960301..6a8aadcbae 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -53,6 +53,7 @@ class CoalesceEngine : public BaseReadEngine bool allocated; bool valid; bool hasConflict; + bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block(): @@ -60,7 +61,8 @@ class CoalesceEngine : public BaseReadEngine takenMask(0), allocated(false), valid(false), - hasConflict(false) + hasConflict(false), + hasChange(false) {} }; @@ -73,13 +75,7 @@ class CoalesceEngine : public BaseReadEngine int numTgtsPerMSHR; std::unordered_map> MSHRMap; - int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; - std::deque outstandingMemReqQueue; - - std::deque addrResponseQueue; - std::deque worklistResponseQueue; + std::deque> responseQueue; std::deque evictQueue; @@ -88,9 +84,6 @@ class CoalesceEngine : public BaseReadEngine PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -115,6 +108,7 @@ class CoalesceEngine : public BaseReadEngine CoalesceStats stats; protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9ed781d79..86418ac76e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -39,10 +39,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), 
baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - memRespQueueSize(params.mem_resp_queue_size), - onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()) {} @@ -66,12 +63,13 @@ PushEngine::startup() *tempPtr = 0; // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - sendPushUpdate(first_update); + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } } - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -108,19 +106,21 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { - assert(pushReqQueue.size() <= pushReqQueueSize); + assert((pushReqQueueSize == 0) || + (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t update_value = wl.prop; - pushReqQueue.push_back( - std::make_pair(std::make_pair(start_addr, end_addr), update_value)); + uint32_t value = wl.prop; - if ((!nextAddrGenEvent.scheduled()) && - (!pushReqQueue.empty())) { + // TODO: parameterize 64 to memory atom size + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + + assert(!pushReqQueue.empty()); + if (!nextAddrGenEvent.scheduled()) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -129,65 +129,44 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - Addr start_addr, end_addr; - uint32_t update_value; - std::pair, uint32_t> front = pushReqQueue.front(); - std::tie(start_addr, end_addr) = front.first; - update_value = front.second; + Addr 
aligned_addr, offset; + int num_edges; - Addr req_addr = (start_addr / 64) * 64; - Addr req_offset = start_addr % 64; - int num_edges = 0; + PushPacketInfoGen curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (end_addr > req_addr + 64) { - num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); - } else { - num_edges = (end_addr - start_addr) / sizeof(Edge); - } - PacketPtr pkt = createReadPacket(req_addr, 64); - reqOffsetMap[pkt->req] = req_offset; + PacketPtr pkt = createReadPacket(aligned_addr, 64); + reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = update_value; - pendingReadReqs.push_back(pkt); + reqValueMap[pkt->req] = curr_info.value(); - pushReqQueue.pop_front(); + enqueueMemReq(pkt); - if (req_addr + 64 < end_addr) { - pushReqQueue.push_front( - std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) - ); + if (curr_info.done()) { + pushReqQueue.pop_front(); } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((memReqQueueFull()) && (!pushReqQueue.empty())) { + requestAlarm(1); + return; } - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); } } void -PushEngine::processNextReadEvent() +PushEngine::respondToAlarm() { - if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && - (!memPortBlocked())) { - PacketPtr pkt = pendingReadReqs.front(); - sendMemReq(pkt); - onTheFlyReadReqs++; - pendingReadReqs.pop_front(); - } - - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); - } + assert(!nextAddrGenEvent.scheduled()); + schedule(nextAddrGenEvent, nextCycle()); } bool PushEngine::handleMemResp(PacketPtr pkt) { - onTheFlyReadReqs--; 
memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { @@ -201,39 +180,42 @@ void PushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); - Addr offset = reqOffsetMap[req]; - uint32_t value = reqValueMap[req]; + Addr offset = reqOffsetMap[pkt->req]; + assert(offset < 64); + uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); - Edge* e = (Edge*) (data + offset); - // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + Edge* curr_edge = (Edge*) (data + offset); // TODO: Implement propagate function here uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, update_value); + __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), update_value); + PacketPtr update = createUpdatePacket( + curr_edge->neighbor, update_value); - if (sendPushUpdate(update)) { + if (!reqPort.blocked()) { DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, e->neighbor, update_value); - reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); - reqNumEdgeMap[req]--; - } - - if (reqNumEdgeMap[req] == 0) { + __func__, curr_edge->neighbor, update_value); + reqPort.sendPacket(update); + reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); + assert(reqOffsetMap[pkt->req] <= 64); + reqNumEdgeMap[pkt->req]--; + assert(reqNumEdgeMap[pkt->req] >= 0); + } + + if (reqNumEdgeMap[pkt->req] == 0) { + reqOffsetMap.erase(pkt->req); + reqNumEdgeMap.erase(pkt->req); + reqValueMap.erase(pkt->req); + delete pkt; memRespQueue.pop_front(); - reqOffsetMap.erase(req); - reqNumEdgeMap.erase(req); - reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ 
-241,11 +223,11 @@ PushEngine::processNextPushEvent() } } -PacketPtr -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + RequestPtr req = std::make_shared( + addr, sizeof(T), 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) _requestorId) << 2); @@ -255,19 +237,9 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) pkt->allocate(); // pkt->setData(data); - pkt->setLE(value); + pkt->setLE(value); return pkt; } -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a539079ede..2aba0ca008 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -39,6 +39,42 @@ namespace gem5 class PushEngine : public BaseReadEngine { private: + class PushPacketInfoGen { + private: + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + uint32_t _value; + + public: + PushPacketInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _value(value) + {} + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = std::floor(_start / _atom) * _atom; + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (_start + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + _start = aligned_addr + _atom; + + return std::make_tuple(aligned_addr, offset, num_items); + } + + uint32_t value() { return _value; } + bool done() { 
return (_start >= _end); } + }; + class ReqPort : public RequestPort { private: @@ -64,37 +100,30 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque, uint32_t>> pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // TODO: Possibility of infinite queueing - std::deque pendingReadReqs; - - int memRespQueueSize; - int onTheFlyReadReqs; + // Since the push engine can process incoming packets faster than + // memory can send those packets, the size of this queue will + // always be limited by the b/w of the memory. std::deque memRespQueue; virtual void startup(); - // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); - - bool sendPushUpdate(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - EventFunctionWrapper nextPushEvent; void processNextPushEvent(); protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ea45cae652..cca945ce0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -118,9 +118,10 @@ WLEngine::getAddrRanges() const void WLEngine::processNextReadEvent() { - PacketPtr update = updateQueue.front(); - Addr update_addr = update->getAddr(); - uint32_t update_value = update->getLE(); + Addr update_addr; + uint32_t update_value; + std::tie(update_addr, update_value) = updateQueue.front(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); @@ -229,10 +230,11 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push_back(pkt); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); + delete pkt; assert(!updateQueue.empty()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 476c9be932..12df93ee79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,7 +71,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::deque updateQueue; + std::deque> updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From e1a428134c05fe3949b06b780cd143732a583e3a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Apr 2022 13:13:37 -0700 Subject: [PATCH 084/279] Fixing one scheduling error in events. 
--- configs/accl/sega.py | 7 +++---- src/accl/graph/base/base_read_engine.cc | 12 ++++++++++++ src/accl/graph/base/base_read_engine.hh | 2 ++ src/accl/graph/sega/coalesce_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.cc | 23 +++++++++++++++++------ src/accl/graph/sega/wl_engine.cc | 4 ++-- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e24280366..e45580dd37 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,13 +5,12 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, - mem_resp_queue_size=8) + push_req_queue_size=16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index e3b588cfc6..1658d85627 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -86,6 +86,8 @@ BaseReadEngine::MemPort::recvReqRetry() if (!blocked()) { blockedPacket = nullptr; } + + owner->wakeUp(); } void @@ -177,8 +179,18 @@ BaseReadEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); + DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); alarmRequested = true; spaceRequested = space; } +void +BaseReadEngine::wakeUp() +{ + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + } diff --git a/src/accl/graph/base/base_read_engine.hh 
b/src/accl/graph/base/base_read_engine.hh index bec922beef..5275f86449 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -108,6 +108,8 @@ class BaseReadEngine : public ClockedObject void recvFunctional(PacketPtr pkt); + void wakeUp(); + }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 015629245b..c740597a2c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -202,8 +202,8 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " - "worklistResponseQueue.size = %d.\n", __func__, + DPRINTF(MPU, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, responseQueue.size()); if ((!nextRespondEvent.scheduled()) && @@ -338,7 +338,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -355,7 +355,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty()) && - (pendingAlarm())) { + (!pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 86418ac76e..3c1a98c69a 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -120,7 +120,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); assert(!pushReqQueue.empty()); - if (!nextAddrGenEvent.scheduled()) { + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -133,8 +134,11 @@ PushEngine::processNextAddrGenEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen curr_info = pushReqQueue.front(); + PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(MPU, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, 64); reqOffsetMap[pkt->req] = offset; @@ -144,11 +148,17 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { + DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); } - if ((memReqQueueFull()) && (!pushReqQueue.empty())) { - requestAlarm(1); + if (memReqQueueFull()) { + if (!pushReqQueue.empty()) { + requestAlarm(1); + } return; } @@ -162,6 +172,7 @@ PushEngine::respondToAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); + DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool @@ -201,9 +212,9 @@ PushEngine::processNextPushEvent() curr_edge->neighbor, update_value); if (!reqPort.blocked()) { - DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); reqPort.sendPacket(update); + DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= 64); reqNumEdgeMap[pkt->req]--; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index cca945ce0a..ad9e93ba60 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -146,7 +146,7 @@ WLEngine::processNextReadEvent() } else { // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); @@ -231,7 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From 7a9be5b96b5b59f1e9065cf602ba301f91f211be Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 10 Apr 2022 16:42:27 -0700 Subject: [PATCH 085/279] Works!!!!!! --- configs/accl/sega.py | 4 ++-- src/accl/graph/TODO.md | 6 ++++++ src/accl/graph/sega/push_engine.cc | 8 ++++++++ src/accl/graph/sega/push_engine.hh | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e45580dd37..e68097ce74 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,7 +11,7 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port @@ -40,7 +40,7 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port self.interconnect.mem_side_ports = self.edge_mem_ctrl.port diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index a0e2cefeff..f6d77d5e22 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -7,3 +7,9 @@ and memory atom size in the coalesce engine sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys + + +Advice from Jason: +* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
+* if it +* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 3c1a98c69a..1fced87a43 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -106,6 +106,14 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { + // If there are no outgoing edges, no need to generate and push + // updates. Therefore, we only need to return true. + if (wl.degree == 0) { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + return true; + } + assert((pushReqQueueSize == 0) || (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2aba0ca008..29d18709ee 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -61,7 +61,7 @@ class PushEngine : public BaseReadEngine Addr offset = _start - aligned_addr; int num_items = 0; - if (_end > (_start + _atom)) { + if (_end > (aligned_addr + _atom)) { num_items = (_atom - offset) / _step; } else { num_items = (_end - _start) / _step; From 4e79e4192f698f449216b319e120cd121ec5146b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 09:59:31 -0700 Subject: [PATCH 086/279] Removing SystemXBar from config script.
[has-bug] --- configs/accl/sega.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e68097ce74..dd7623bfea 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,10 +11,6 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar(max_routing_table_size=16384) - - self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port - self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): return self.wl_engine.resp_port @@ -26,10 +22,15 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.req_port = port - def getMemPort(self): - return self.interconnect.mem_side_ports - def setMemPort(self, port): - self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): @@ -40,15 +41,16 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar(max_routing_table_size=16384) - self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port - self.interconnect.mem_side_ports = self.edge_mem_ctrl.port + def getVertexPort(self): + return self.vertex_mem_ctrl.port + def setVertexPort(self, port): + self.vertex_mem_ctrl.port = port - def getPort(self): - return self.interconnect.cpu_side_ports - def setPort(self, port): - self.interconnect.cpu_side_ports = port + def getEdgePort(self): + 
return self.edge_mem_ctrl.port + def setEdgePort(self, port): + self.edge_mem_ctrl.port = port class SEGA(System): def __init__(self): @@ -65,7 +67,8 @@ def __init__(self): edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.getPort()) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) + self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) system = SEGA() root = Root(full_system = False, system = system) From 75c36e64529f6d7ef0746c5e9770082fec79ad66 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 19:59:24 -0700 Subject: [PATCH 087/279] Fixing the bug when deallocating a taken line. --- configs/accl/sega.py | 4 +- src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 1 + src/accl/graph/base/base_read_engine.hh | 2 + src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 388 +++++++++++++----------- 6 files changed, 222 insertions(+), 178 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index dd7623bfea..7f4663cc82 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -36,10 +36,10 @@ class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): super(MPUMemory, self).__init__() self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="25GB/s", + range=vertex_range, bandwidth="19.2GB/s", latency="30ns", image_file=vertex_binary) self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="25GB/s", + range=edge_range, bandwidth="19.2GB/s", latency="30ns", image_file=edge_binary) def getVertexPort(self): diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 3ddab2d3c4..d4ab622fd6 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -40,3 +40,6 @@ class BaseReadEngine(ClockedObject): outstanding_mem_req_queue_size = Param.Int(16, 
"Capacity of queue in " "which memory requests are queued.") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 1658d85627..19214a3bd1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,6 +36,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 5275f86449..0cab95dbbb 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,6 +68,8 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int peerMemoryAtomSize; + int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index bec7e3d233..3e5699f552 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseReadEngine): peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index c740597a2c..41d1fe4953 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -88,7 +88,11 @@ CoalesceEngine::recvReadAddr(Addr addr) __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a 
bitset instead of unsigned int for takenMask + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,7 +148,11 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -256,7 +264,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / 16; @@ -269,7 +277,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -336,7 +348,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -373,189 +389,209 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - - if ((cacheBlocks[block_index].hasChange)&& - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - 
(!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); - return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < 4; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + if (cacheBlocks[block_index].takenMask == 0) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); + } else { + int spaceNeeded = 
cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); + return; } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); + // Reducing between tempProp and prop for each item in the cache line. 
+ for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + } + + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); + // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + + // Since allocated is false, does not matter what the address is. + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } - // TODO: This should be improved - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary.\n", __func__, block_index); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); + + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", - __func__, block_index, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " + "for eviction. 
Therefore, ignoring the evict schedule.\n", + __func__, block_index); } evictQueue.pop_front(); From 3fba566556d4f9f93e2c2ce5ad227a30a1b85324 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 09:46:44 -0700 Subject: [PATCH 088/279] Parameterizing cache_size and memory_atom_size. --- src/accl/graph/TODO.md | 12 --- src/accl/graph/base/base_read_engine.cc | 7 +- src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/sega/CoalesceEngine.py | 5 +- src/accl/graph/sega/coalesce_engine.cc | 127 +++++++++--------------- src/accl/graph/sega/coalesce_engine.hh | 16 +-- src/accl/graph/sega/push_engine.cc | 9 +- 7 files changed, 74 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f6d77d5e22..1cec4dc6f9 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,15 +1,3 @@ # TODO Items - -* use setLE/setBE inside createUpdatePacket and createWritePacket -* parameterize cache size, associativity, maybe latencies, -and memory atom size in the coalesce engine -* look at all the simobjects and come up with a general architecture. Make -sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys - - -Advice from Jason: -* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
-* if it -* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 19214a3bd1..714a4542f1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,12 +36,12 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - _requestorId(system->getRequestorId(this)) + _requestorId(system->getRequestorId(this)), + peerMemoryAtomSize(params.attached_memory_atom_size) {} BaseReadEngine::~BaseReadEngine() @@ -101,6 +101,9 @@ BaseReadEngine::processNextMemReqEvent() // TODO: Maybe add a DPRINTF here. PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); if (alarmRequested && diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 0cab95dbbb..f11459ad6e 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,8 +68,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - int peerMemoryAtomSize; - int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; @@ -81,6 +79,8 @@ class BaseReadEngine : public ClockedObject protected: const RequestorID _requestorId; + size_t peerMemoryAtomSize; + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } bool memReqQueueHasSpace(int space); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 3e5699f552..faa5295ed7 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,10 @@ class CoalesceEngine(BaseReadEngine): cxx_class = 'gem5::CoalesceEngine' peer_push_engine = Param.PushEngine(NULL, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 41d1fe4953..4d152e375d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,21 +38,17 @@ namespace gem5 CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), peerPushEngine(params.peer_push_engine), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) -{} - -void -CoalesceEngine::startup() { - for (int i = 0; i < 256; i++) { - cacheBlocks[i].takenMask = 0; - cacheBlocks[i].allocated = false; - cacheBlocks[i].valid = false; - cacheBlocks[i].hasConflict = false; + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); } } @@ -74,8 +70,8 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr aligned_addr = (addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; + Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && @@ -162,11 +158,11 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); - // TODO: Parameterize 64 to memory atom size - PacketPtr pkt = createReadPacket(aligned_addr, 64); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, addr, aligned_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); @@ -240,10 +236,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. 
- int block_index = (addr / 64) % 256; + + int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -264,10 +258,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / 16; + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); @@ -334,9 +328,9 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. - Addr aligned_addr = std::floor(addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; - int wl_offset = (addr - aligned_addr) / 16; + Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); @@ -437,12 +431,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, + cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); + DPRINTF(MPU, 
"%s: Created a write packet to Addr: %lu, size = %d.\n", + __func__, write_pkt->getAddr(), peerMemoryAtomSize); if (cacheBlocks[block_index].hasConflict) { DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " "enough space in outstandingMemReqQueue for the write " @@ -451,12 +445,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); stats.numVertexBlockWrites++; @@ -465,28 +462,13 @@ CoalesceEngine::processNextApplyAndCommitEvent() "its subsequent read packet (to service the conflicts)" " to outstandingMemReqQueue.\n" , __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - 
peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } - // TODO: This should be improved cacheBlocks[block_index].addr = aligned_miss_addr; DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, @@ -509,26 +491,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } // Since allocated is false, does not matter what the address is. 
@@ -555,11 +523,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", + " req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, aligned_miss_addr); enqueueMemReq(read_pkt); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6a8aadcbae..0ddbdfdeb1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,7 @@ class CoalesceEngine : public BaseReadEngine private: struct Block { - WorkListItem items[4]; + WorkListItem* items; Addr addr; uint8_t takenMask; bool allocated; @@ -56,20 +56,26 @@ class CoalesceEngine : public BaseReadEngine bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; - Block(): + Block() {} + Block(int num_elements): addr(0), takenMask(0), allocated(false), valid(false), hasConflict(false), hasChange(false) - {} + { + items = new WorkListItem [num_elements]; + } }; WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block cacheBlocks[256]; + Block* cacheBlocks; + + int numLines; + int numElementsPerLine; int numMSHREntry; int numTgtsPerMSHR; @@ -79,8 +85,6 @@ class CoalesceEngine : public BaseReadEngine std::deque evictQueue; - virtual void startup(); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); diff --git 
a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 1fced87a43..8dcbac0dcc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -124,8 +124,7 @@ PushEngine::recvWLItem(WorkListItem wl) Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - // TODO: parameterize 64 to memory atom size - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -148,7 +147,7 @@ PushEngine::processNextAddrGenEvent() "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - PacketPtr pkt = createReadPacket(aligned_addr, 64); + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); @@ -202,7 +201,7 @@ PushEngine::processNextPushEvent() uint8_t* data = pkt->getPtr(); Addr offset = reqOffsetMap[pkt->req]; - assert(offset < 64); + assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " @@ -224,7 +223,7 @@ PushEngine::processNextPushEvent() DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= 64); + assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); reqNumEdgeMap[pkt->req]--; assert(reqNumEdgeMap[pkt->req] >= 0); } From 8c3e777bc033a2a5139267f7685bbf1d931346ca Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:21:28 -0700 Subject: [PATCH 089/279] Renaming BaseReadEngine to BaseMemEngine. 
--- configs/accl/sega.py | 6 ++- .../{BaseReadEngine.py => BaseMemEngine.py} | 8 ++-- src/accl/graph/base/SConscript | 4 +- ...base_read_engine.cc => base_mem_engine.cc} | 30 +++++++------- ...base_read_engine.hh => base_mem_engine.hh} | 20 +++++----- src/accl/graph/base/data_structs.hh | 6 +-- src/accl/graph/sega/CoalesceEngine.py | 4 +- src/accl/graph/sega/PushEngine.py | 4 +- src/accl/graph/sega/coalesce_engine.cc | 39 ++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 4 +- src/accl/graph/sega/push_engine.hh | 4 +- 12 files changed, 52 insertions(+), 81 deletions(-) rename src/accl/graph/base/{BaseReadEngine.py => BaseMemEngine.py} (92%) rename src/accl/graph/base/{base_read_engine.cc => base_mem_engine.cc} (87%) rename src/accl/graph/base/{base_read_engine.hh => base_mem_engine.hh} (88%) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7f4663cc82..7d8b96490d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,9 +5,11 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16) + push_req_queue_size=16, + attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine) + peer_push_engine=self.push_engine, + attached_memory_atom_size=64) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseMemEngine.py similarity index 92% rename from src/accl/graph/base/BaseReadEngine.py rename to src/accl/graph/base/BaseMemEngine.py index d4ab622fd6..69f68e9dfc 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -29,11 +29,11 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class BaseReadEngine(ClockedObject): +class 
BaseMemEngine(ClockedObject): abstract = True - type = 'BaseReadEngine' - cxx_header = "accl/graph/base/base_read_engine.hh" - cxx_class = 'gem5::BaseReadEngine' + type = 'BaseMemEngine' + cxx_header = "accl/graph/base/base_mem_engine.hh" + cxx_class = 'gem5::BaseMemEngine' system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index ea96f4323b..4c90dfa9a6 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,8 +27,8 @@ Import('*') -SimObject('BaseReadEngine.py') +SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_read_engine.cc') +Source('base_mem_engine.cc') Source('base_reduce_engine.cc') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_mem_engine.cc similarity index 87% rename from src/accl/graph/base/base_read_engine.cc rename to src/accl/graph/base/base_mem_engine.cc index 714a4542f1..50e64ae7c3 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -26,13 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" namespace gem5 { -BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): +BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), @@ -44,11 +44,11 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): peerMemoryAtomSize(params.attached_memory_atom_size) {} -BaseReadEngine::~BaseReadEngine() +BaseMemEngine::~BaseMemEngine() {} Port& -BaseReadEngine::getPort(const std::string &if_name, PortID idx) +BaseMemEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "mem_port") { return memPort; @@ -58,7 +58,7 @@ BaseReadEngine::getPort(const std::string &if_name, PortID idx) } void -BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -70,14 +70,14 @@ BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) } bool -BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time return owner->handleMemResp(pkt); } void -BaseReadEngine::MemPort::recvReqRetry() +BaseMemEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -92,7 +92,7 @@ BaseReadEngine::MemPort::recvReqRetry() } void -BaseReadEngine::processNextMemReqEvent() +BaseMemEngine::processNextMemReqEvent() { if (memPort.blocked()) { return; @@ -120,7 +120,7 @@ BaseReadEngine::processNextMemReqEvent() } PacketPtr -BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +BaseMemEngine::createReadPacket(Addr addr, unsigned int size) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -135,7 +135,7 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) } PacketPtr -BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); @@ -151,7 +151,7 @@ BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseReadEngine::memReqQueueHasSpace(int space) +BaseMemEngine::memReqQueueHasSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -160,14 +160,14 @@ BaseReadEngine::memReqQueueHasSpace(int space) } bool -BaseReadEngine::memReqQueueFull() +BaseMemEngine::memReqQueueFull() { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); } void -BaseReadEngine::enqueueMemReq(PacketPtr pkt) +BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not 
enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); @@ -179,7 +179,7 @@ BaseReadEngine::enqueueMemReq(PacketPtr pkt) } void -BaseReadEngine::requestAlarm(int space) { +BaseMemEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); @@ -189,7 +189,7 @@ BaseReadEngine::requestAlarm(int space) { } void -BaseReadEngine::wakeUp() +BaseMemEngine::wakeUp() { if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_mem_engine.hh similarity index 88% rename from src/accl/graph/base/base_read_engine.hh rename to src/accl/graph/base/base_mem_engine.hh index f11459ad6e..fb7cab91b0 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -26,33 +26,33 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ #include #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseReadEngine.hh" +#include "params/BaseMemEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" namespace gem5 { -class BaseReadEngine : public ClockedObject +class BaseMemEngine : public ClockedObject { private: class MemPort : public RequestPort { private: - BaseReadEngine* owner; + BaseMemEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseReadEngine* owner): + MemPort(const std::string& name, BaseMemEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -96,10 +96,10 @@ class BaseReadEngine : public ClockedObject PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: - PARAMS(BaseReadEngine); + PARAMS(BaseMemEngine); - BaseReadEngine(const BaseReadEngineParams ¶ms); - ~BaseReadEngine(); + BaseMemEngine(const BaseMemEngineParams ¶ms); + ~BaseMemEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -116,4 +116,4 @@ class BaseReadEngine : public ClockedObject } -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 28a503528f..409245eeaa 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ -#define __ACCL_GRAPH_BASE_UTIL_HH__ +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" @@ -83,4 +83,4 @@ struct __attribute__ ((packed)) Edge } -#endif // __ACCL_GRAPH_BASE_UTIL_HH__ +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index faa5295ed7..086f284950 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class CoalesceEngine(BaseReadEngine): +class CoalesceEngine(BaseMemEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 645bc5f4ea..d3276799aa 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class PushEngine(BaseReadEngine): +class PushEngine(BaseMemEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d152e375d..1c5dee8b8f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -36,7 +36,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -83,12 +83,8 
@@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Use a bitset instead of unsigned int for takenMask - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,11 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -271,11 +263,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -342,11 +330,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -413,7 +397,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Reducing between tempProp and prop for each item in the cache line. 
- for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, @@ -471,11 +455,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -500,11 +480,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Since allocated is false, does not matter what the address is. - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -535,11 +512,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -548,11 +521,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
Just " "deallocating the line.\n", __func__, block_index); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0ddbdfdeb1..4c4cb4567b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -42,7 +42,7 @@ namespace gem5 class WLEngine; -class CoalesceEngine : public BaseReadEngine +class CoalesceEngine : public BaseMemEngine { private: struct Block diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8dcbac0dcc..53cb428b12 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,7 +35,7 @@ namespace gem5 { PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -49,7 +49,7 @@ PushEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "req_port") { return reqPort; } else if (if_name == "mem_port") { - return BaseReadEngine::getPort(if_name, idx); + return BaseMemEngine::getPort(if_name, idx); } else { return SimObject::getPort(if_name, idx); } diff --git a/src/accl/graph/sega/push_engine.hh 
b/src/accl/graph/sega/push_engine.hh index 29d18709ee..5e8b079d88 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,14 +29,14 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 { -class PushEngine : public BaseReadEngine +class PushEngine : public BaseMemEngine { private: class PushPacketInfoGen { From 08dc5d05f96ae79653907f84668e40a185e84e16 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:30:08 -0700 Subject: [PATCH 090/279] Adding a new SConscript for src/accl. --- configs/accl/sega.py | 4 ++-- src/accl/graph/SConscript | 30 ++++++++++++++++++++++++++++++ src/accl/graph/sega/SConscript | 2 +- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 src/accl/graph/SConscript diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7d8b96490d..4168217f4d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="facebook/graph_binaries/vertices", + vertex_binary="graphs/facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="facebook/graph_binaries/edgelist_0") + edge_binary="graphs/facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript new file mode 100644 index 0000000000..00fa2466dd --- /dev/null +++ b/src/accl/graph/SConscript @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Import('*') + +DebugFlag('MPU') \ No newline at end of file diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 9b4629838b..6e563b2677 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('MPU') +DebugFlag('WLWrites') From be06d128770ccc2f76c59fd8dccb89ed6ff356b6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 14:11:32 -0700 Subject: [PATCH 091/279] Fixing stats and adding a few new ones. --- configs/accl/sega.py | 4 +-- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/coalesce_engine.cc | 43 ++++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +-- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 4168217f4d..0532aa2153 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/facebook/graph_binaries/vertices", + vertex_binary="graphs/epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/facebook/graph_binaries/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 6e563b2677..19d702c49a 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('WLWrites') +DebugFlag('ApplyUpdates') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c5dee8b8f..36a7ddb6d2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -83,16 +84,14 @@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.readHits++; - stats.numVertexReads++; assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } + stats.numVertexReads++; return true; } else { // miss @@ -105,6 +104,7 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); + stats.readRejections++; return false; } else { DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); @@ -117,12 +117,15 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d]", __func__, addr, block_index); + stats.readMisses++; + stats.numVertexReads++; return true; } else { assert(!cacheBlocks[block_index].valid); @@ -137,6 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (memReqQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); + stats.readRejections++; return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -158,7 +162,8 @@ CoalesceEngine::recvReadAddr(Addr addr) enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); - stats.numVertexBlockReads++; + stats.readMisses++; + stats.numVertexReads++; return true; } } @@ -169,6 +174,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } if ((!cacheBlocks[block_index].hasConflict) && @@ -178,9 +184,17 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } + + if (aligned_addr != cacheBlocks[block_index].addr) { + stats.readMisses++; + } else { + stats.readHits++; + } + MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; return true; } } @@ -264,7 +278,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.numVertexReads++; + servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); @@ -334,7 +348,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Make this more general and programmable. // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add // to evictQueue. 
@@ -440,7 +453,6 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" @@ -448,6 +460,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -467,12 +482,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -548,16 +565,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), - ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), - "Number of memory blocks writes for vertecies"), ADD_STAT(numVertexReads, 
statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), "Number of memory vertecies written to cache."), ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits.") + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readRejections, statistics::units::Count::get(), + "Number of cache rejections.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4c4cb4567b..efd19d3e9b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,11 +102,11 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine &coalesce; - statistics::Scalar numVertexBlockReads; - statistics::Scalar numVertexBlockWrites; statistics::Scalar numVertexReads; statistics::Scalar numVertexWrites; statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readRejections; }; CoalesceStats stats; From 0b4177ed6f0a7ecb7ae39ebea144e6d667637c42 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 15 Apr 2022 15:21:34 -0700 Subject: [PATCH 092/279] Fixing memory atom size issue. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0532aa2153..61df2cc2ef 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ def __init__(self, base_edge_addr): attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=64) + attached_memory_atom_size=32) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36a7ddb6d2..e54447fd09 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -251,7 +251,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 53cb428b12..195cb65dbc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -185,6 +185,8 @@ PushEngine::respondToAlarm() bool PushEngine::handleMemResp(PacketPtr pkt) { + // TODO: in case we need to edit edges, get rid of second statement. + assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { From d951956ab9667b34203e9f02fc3667e0eb8060fb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 17 Apr 2022 13:34:12 -0700 Subject: [PATCH 093/279] Removing dead code. 
--- configs/accl/sega.py | 4 ++-- src/accl/graph/sega/push_engine.cc | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 61df2cc2ef..450f158f93 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices", + vertex_binary="graphs/test-graph/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") + edge_binary="graphs/test-graph/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 195cb65dbc..716daf92e8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -58,11 +58,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); if (!reqPort.blocked()) { From 96761b6653649259df40ae63176c74aaa85a8adc Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 19 Apr 2022 12:03:25 -0700 Subject: [PATCH 094/279] [WIP] added the central control unit. 
It has error about the crossbar --- configs/accl/sega.py | 10 +- src/accl/graph/sega/CenteralController.py | 39 +++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/centeral_controller.cc | 123 +++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 84 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 10 -- src/accl/graph/sega/push_engine.hh | 2 - src/accl/graph/sega/wl_engine.cc | 6 + src/accl/graph/sega/wl_engine.hh | 2 + 9 files changed, 263 insertions(+), 15 deletions(-) create mode 100644 src/accl/graph/sega/CenteralController.py create mode 100644 src/accl/graph/sega/centeral_controller.cc create mode 100644 src/accl/graph/sega/centeral_controller.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 450f158f93..c4288c92d3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,14 +61,18 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.ctrl = CenteralController(addr=0, value=0) self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test-graph/graph_binaries/vertices_0", + vertex_binary="graphs/test/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test-graph/graph_binaries/edgelist_0") + edge_binary="graphs/test/edgelist_0") + self.interconnect = SystemXBar() - self.mpu.setReqPort(self.mpu.getRespPort()) + self.ctrl.req_port = self.interconnect.cpu_side_ports + self.mpu.setReqPort(self.interconnect.cpu_side_ports) + self.mpu.setRespPort(self.interconnect.mem_side_ports) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..7b00f8b12d --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 
-*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class CenteralController(ClockedObject): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + req_port = RequestPort("Port to send updates to the outside") + addr = Param.Addr("") + value = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 19d702c49a..c8810bbdb2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,12 @@ Import('*') +SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..daa2d9b390 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/centeral_controller.hh" + +#include "mem/packet_access.hh" + +namespace gem5 +{ + +CenteralController::CenteralController + (const CenteralControllerParams ¶ms): + ClockedObject(params), + reqPort(name() + ".req_port", this), + addr(params.addr), + value(params.value) +{} + +Port& +CenteralController::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +CenteralController::startup() +{ + PacketPtr first_update = + createUpdatePacket(addr, value); + + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } +} + +template PacketPtr +CenteralController::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared( + addr, sizeof(T), addr, value); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) value) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +// AddrRangeList +// CenteralController::ReqPort::getAddrRanges() const 
+// { +// AddrRangeList ret; +// ret.clear(); +// return ret; +// } + +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!_blocked) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh new file mode 100644 index 0000000000..0e1bb6ac80 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "params/CenteralController.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class CenteralController : public ClockedObject +{ + private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, CenteralController* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + Addr addr; + uint32_t value; + + template PacketPtr + createUpdatePacket(Addr addr, T value); + + virtual void startup(); + + public: + PARAMS(CenteralController); + CenteralController(const CenteralControllerParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + 
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 716daf92e8..ddfc2edef8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -55,16 +55,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -void -PushEngine::startup() -{ - PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); - } -} - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5e8b079d88..ce9045e91a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -112,8 +112,6 @@ class PushEngine : public BaseMemEngine // always be limited by the b/w of the memory. std::deque memRespQueue; - virtual void startup(); - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ad9e93ba60..40fca42d26 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -58,6 +58,12 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } +void +WLEngine::init() +{ + respPort.sendRangeChange(); +} + AddrRangeList WLEngine::RespPort::getAddrRanges() const { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 12df93ee79..2698ce3ea8 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -65,6 +65,8 @@ class WLEngine : public BaseReduceEngine virtual void recvRespRetry(); }; + virtual void init(); + RespPort respPort; bool blockedByCoalescer; From 311062bb17651d762e683378ffcd5ce2d7514198 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 22 Apr 2022 11:44:24 -0700 Subject: [PATCH 095/279] Adding UpdateWL as a MemCmd and fixing code. 
--- configs/accl/sega.py | 5 +- src/accl/graph/TODO.md | 5 + src/accl/graph/base/data_structs.hh | 3 + src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/coalesce_engine.cc | 195 +++++++++------------ src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 31 +--- src/mem/packet.cc | 40 +---- src/mem/packet.hh | 4 +- 9 files changed, 105 insertions(+), 194 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c4288c92d3..aa3675d847 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -65,14 +65,15 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test/vertices_0", + vertex_binary="graphs/epinions/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.interconnect = SystemXBar() self.ctrl.req_port = self.interconnect.cpu_side_ports self.mpu.setReqPort(self.interconnect.cpu_side_ports) self.mpu.setRespPort(self.interconnect.mem_side_ports) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 1cec4dc6f9..f5690a3faa 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,3 +1,8 @@ # TODO Items + * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys +* add UpdateWL as a MemCmd +* Replace std::floor with roundDown from intmath.hh in src +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. 
diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 409245eeaa..7535d4bbac 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" +#include "base/intmath.hh" namespace gem5 { @@ -81,6 +82,8 @@ struct __attribute__ ((packed)) Edge {} }; +static_assert(isPowerOf2(sizeof(WorkListItem))); + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index daa2d9b390..41ebeb9cd6 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -54,8 +54,7 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::startup() { - PacketPtr first_update = - createUpdatePacket(addr, value); + PacketPtr first_update = createUpdatePacket(addr, value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -71,8 +70,7 @@ CenteralController::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) value) << 2); - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); @@ -81,14 +79,6 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } -// AddrRangeList -// CenteralController::ReqPort::getAddrRanges() const -// { -// AddrRangeList ret; -// ret.clear(); -// return ret; -// } - void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e54447fd09..e6503ea01d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "base/intmath.hh" #include 
"debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -47,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) { + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); cacheBlocks = new Block [numLines]; for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); @@ -72,18 +74,25 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextRespondEvent for latency cycles in + // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.readHits++; @@ -104,6 +113,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. 
" "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection stats.readRejections++; return false; } else { @@ -200,6 +211,7 @@ CoalesceEngine::recvReadAddr(Addr addr) } } +// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextRespondEvent() { @@ -241,8 +253,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - uint8_t* data = pkt->getPtr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", @@ -250,17 +260,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - cacheBlocks[block_index].items[i] = *((WorkListItem*) ( - data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; delete pkt; - int bias = 0; + // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; @@ -271,20 +281,26 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); + // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); + // End of the said block servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } + // TODO: We Can use taken instead of this + // TODO: Change the MSHRMap from map to map + int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + @@ -298,8 +314,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { - // TODO: I think this is unnecessary. - cacheBlocks[block_index].hasConflict = true; + assert(cacheBlocks[block_index].hasConflict); } if ((!nextRespondEvent.scheduled()) && @@ -341,11 +356,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { cacheBlocks[block_index].hasChange = true; + stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -380,8 +395,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); + if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange)&& + if ((cacheBlocks[block_index].hasChange) && (cacheBlocks[block_index].hasConflict) && (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", @@ -420,6 +436,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + // TODO: Add a stat to count the number of changed props. DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } @@ -434,117 +451,65 @@ CoalesceEngine::processNextApplyAndCommitEvent() (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), peerMemoryAtomSize); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - - enqueueMemReq(write_pkt); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } + enqueueMemReq(write_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet to " + "outstandingMemReqQueue.\n" , __func__); + + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", + __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); } - - // Since allocated is false, does not matter what the address is. - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } - } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); + } - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for a read " + "packet.\n", __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } + } else { DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " "for eviction. 
Therefore, ignoring the evict schedule.\n", diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ddfc2edef8..e822b7168b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -238,7 +238,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) req->setPC(((Addr) _requestorId) << 2); // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 40fca42d26..148f5de5be 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,8 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +// TODO: Parameterize the number of pops WLEngine can do at a time. +// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void WLEngine::processNextReadEvent() { @@ -144,9 +146,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } } } else { @@ -164,9 +164,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -194,12 +192,9 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = - addrWorkListMap.begin(); - - std::vector servicedAddresses; - while (it != addrWorkListMap.end()) { - Addr addr = it->first; + for (auto &it : addrWorkListMap) { + Addr addr = it.first; + assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " @@ -214,17 +209,9 @@ WLEngine::processNextReduceEvent() stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - servicedAddresses.push_back(addr); - DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", - __func__, addr); - it++; - } - - addrWorkListMap.clear(); - for (int i = 0; i < servicedAddresses.size(); i++) { - onTheFlyUpdateMap.erase(servicedAddresses[i]); + onTheFlyUpdateMap.erase(addr); DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, servicedAddresses[i]); + __func__, addr); } } diff --git a/src/mem/packet.cc b/src/mem/packet.cc index da45246e49..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange @@ -532,43 +533,4 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } -std::string -Packet::printData() -{ - char ret[1024]; - if (isWrite()) { - uint8_t* data = getPtr(); - std::sprintf(ret,"\n" - "V[%lu] temp_prop: %u, prop: %u, " 
- "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n", - getAddr(), - *((uint32_t*) data), - *((uint32_t*) (data + 4)), - *((uint32_t*) (data + 8)), - *((uint32_t*) (data + 12)), - getAddr() + 16, - *((uint32_t*) (data + 16)), - *((uint32_t*) (data + 20)), - *((uint32_t*) (data + 24)), - *((uint32_t*) (data + 28)), - getAddr() + 32, - *((uint32_t*) (data + 32)), - *((uint32_t*) (data + 36)), - *((uint32_t*) (data + 40)), - *((uint32_t*) (data + 44)), - getAddr() + 48, - *((uint32_t*) (data + 48)), - *((uint32_t*) (data + 52)), - *((uint32_t*) (data + 56)), - *((uint32_t*) (data + 60))); - } - return ret; -} - } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 26a7099d53..69686e7835 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -150,7 +150,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - // UpdateWL, + UpdateWL, NUM_MEM_CMDS }; @@ -1387,8 +1387,6 @@ class Packet : public Printable, public Extensible template void setRaw(T v); - std::string printData(); - public: /** * Check a functional request against a memory value stored in From 620f101a39ff790f6da7887e107a1e9357e42620 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Apr 2022 20:28:25 -0700 Subject: [PATCH 096/279] A little bit of debugging and updating config script. 
--- configs/accl/sega.py | 138 +++++++++++++++++++++++-------- src/accl/graph/TODO.md | 5 +- src/accl/graph/sega/wl_engine.cc | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 4 files changed, 105 insertions(+), 41 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index aa3675d847..9dd8c0f358 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,5 +1,9 @@ import m5 +import argparse + +from math import log from m5.objects import * +from m5.util.convert import toMemorySize class MPU(SubSystem): def __init__(self, base_edge_addr): @@ -35,53 +39,115 @@ def setEdgeMemPort(self, port): self.push_engine.mem_port = port class MPUMemory(SubSystem): - def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + def __init__(self, + num_channels: int, + cache_line_size: int, + vertex_memory_size: str, + edge_memory_size: str, + graph_path: str): super(MPUMemory, self).__init__() - self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="19.2GB/s", - latency="30ns", image_file=vertex_binary) - self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="19.2GB/s", - latency="30ns", image_file=edge_binary) - - def getVertexPort(self): - return self.vertex_mem_ctrl.port - def setVertexPort(self, port): - self.vertex_mem_ctrl.port = port - - def getEdgePort(self): - return self.edge_mem_ctrl.port - def setEdgePort(self, port): - self.edge_mem_ctrl.port = port + + self._vertex_ranges = self._interleave_addresses( + AddrRange(start=0, size=vertex_memory_size),\ + num_channels,\ + cache_line_size) + + self._edge_chunk_size = int(\ + toMemorySize(edge_memory_size)/num_channels) + self._edge_ranges = [AddrRange(\ + start=toMemorySize(vertex_memory_size)+\ + self._edge_chunk_size*i,\ + size=self._edge_chunk_size)\ + for i in range(num_channels)] + + vertex_mem_ctrl = [] + edge_mem_ctrl = [] + for i in range(num_channels): + vertex_mem_ctrl.append( + SimpleMemory(range=self._vertex_ranges[i], + bandwidth="19.2GB/s", + 
latency="30ns", + image_file=f"{graph_path}/vertices_{i}") + ) + edge_mem_ctrl.append( + SimpleMemory(range=self._edge_ranges[i], + bandwidth="19.2GB/s", + latency="30ns", + image_file=f"{graph_path}/edgelist_{i}") + ) + self.vertex_mem_ctrl = vertex_mem_ctrl + self.edge_mem_ctrl = edge_mem_ctrl + + def _interleave_addresses(self, + plain_range, + num_channels, + cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + + def getVertexPort(self, i): + return self.vertex_mem_ctrl[i].port + def setVertexPort(self, port, i): + self.vertex_mem_ctrl[i].port = port + + def getEdgeBaseAddr(self, i): + return self._edge_ranges[i].start + def getEdgePort(self, i): + return self.edge_mem_ctrl[i].port + def setEdgePort(self, port, i): + self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self): + def __init__(self, num_mpus, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.ctrl = CenteralController(addr=0, value=0) - self.mpu = MPU(base_edge_addr=0x80000000) - self.mem_ctrl = MPUMemory( - vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices_0", - edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") - self.interconnect = SystemXBar() + self.interconnect = NoncoherentXBar(frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64) + self.ctrl = CenteralController(addr=0, value=0) self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mpu.setReqPort(self.interconnect.cpu_side_ports) - self.mpu.setRespPort(self.interconnect.mem_side_ports) - 
self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) - self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) + self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + + mpus = [] + for i in range(num_mpus): + mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus[i].setReqPort(self.interconnect.cpu_side_ports) + mpus[i].setRespPort(self.interconnect.mem_side_ports) + mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) + mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) + self.mpu = mpus + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_mpus", type=int) + argparser.add_argument("graph_path", type=str) + args = argparser.parse_args() + return args.num_mpus, args.graph_path -system = SEGA() -root = Root(full_system = False, system = system) +if __name__ == "__m5_main__": + num_mpus, graph_path = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") + system = SEGA(num_mpus, graph_path) + root = Root(full_system = False, system = system) -m5.instantiate() + m5.instantiate() -exit_event = m5.simulate() -print("Simulation finished!") -exit() + exit_event = m5.simulate() + print("Simulation finished!") + exit() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f5690a3faa..29b5a2939e 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,8 +1,5 @@ # TODO Items -* implement all the communications between simobjects as req/retry. -* get rid of maps with RequestPtr as keys -* add UpdateWL as a MemCmd * Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. + the same cycle that another event is consuming something from the queue. 
\ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 148f5de5be..e949cbcf5b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -213,6 +213,7 @@ WLEngine::processNextReduceEvent() DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", __func__, addr); } + addrWorkListMap.clear(); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2698ce3ea8..597fdb2b1e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -52,7 +52,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) {} virtual AddrRangeList getAddrRanges() const; From 6a9aa292e1f94b45c28bd529cfebcdb725835d25 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Apr 2022 16:56:04 -0700 Subject: [PATCH 097/279] Adding initState to CenteralController. 
--- configs/accl/sega.py | 23 ++++++++++------ src/accl/graph/sega/CenteralController.py | 3 ++ src/accl/graph/sega/centeral_controller.cc | 32 ++++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 6 +++- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9dd8c0f358..0907ba77de 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -66,8 +66,7 @@ def __init__(self, vertex_mem_ctrl.append( SimpleMemory(range=self._vertex_ranges[i], bandwidth="19.2GB/s", - latency="30ns", - image_file=f"{graph_path}/vertices_{i}") + latency="30ns") ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], @@ -108,21 +107,28 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, graph_path): + def __init__(self, num_mpus, vertex_cache_line_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = vertex_cache_line_size self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0) + self.ctrl = CenteralController(addr=0, value=0, + image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + self.mem_ctrl = MPUMemory( + num_mpus, + self.cache_line_size, + "2GiB", + "2GiB", + graph_path) mpus = [] for i in range(num_mpus): @@ -136,14 +142,15 @@ def __init__(self, num_mpus, graph_path): def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) + argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) args = argparser.parse_args() - return args.num_mpus, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, 
args.graph_path if __name__ == "__m5_main__": - num_mpus, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, graph_path = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, graph_path) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7b00f8b12d..bd2f6320a8 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -34,6 +34,9 @@ class CenteralController(ClockedObject): cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' + system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") addr = Param.Addr("") value = Param.Int(0, "") + + image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 41ebeb9cd6..3c05972224 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include "base/loader/memory_image.hh" +#include "base/loader/object_file.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -36,6 +39,7 @@ namespace gem5 CenteralController::CenteralController (const CenteralControllerParams ¶ms): ClockedObject(params), + system(params.system), reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) @@ -51,6 +55,26 @@ CenteralController::getPort(const std::string &if_name, PortID idx) } } +void +CenteralController::initState() +{ + ClockedObject::initState(); + + const auto &file = params().image_file; + if (file == "") + return; + + auto *object = loader::createObjectFile(file, true); + 
fatal_if(!object, "%s: Could not load %s.", name(), file); + + loader::debugSymbolTable.insert(*object->symtab().globals()); + loader::MemoryImage image = object->buildImage(); + PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, + system->cacheLineSize()); + + panic_if(!image.write(proxy), "%s: Unable to write image."); +} + void CenteralController::startup() { @@ -110,4 +134,12 @@ CenteralController::ReqPort::recvReqRetry() } } +void +CenteralController::functionalAccess(PacketPtr pkt) +{ + DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + reqPort.sendFunctional(pkt); +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 0e1bb6ac80..102800de92 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -61,16 +61,20 @@ class CenteralController : public ClockedObject virtual void recvReqRetry(); }; + System* system; ReqPort reqPort; Addr addr; uint32_t value; - template PacketPtr + template PacketPtr createUpdatePacket(Addr addr, T value); + virtual void initState(); virtual void startup(); + void functionalAccess(PacketPtr pkt); + public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); From 2715f74dd21fa2d14cec7d1aef02e1a86fd9efd6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Apr 2022 15:28:06 -0700 Subject: [PATCH 098/279] Changing debug flag for CenteralController. 
--- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/centeral_controller.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index c8810bbdb2..16fab86ede 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -38,3 +38,4 @@ Source('push_engine.cc') Source('wl_engine.cc') DebugFlag('ApplyUpdates') +DebugFlag('CenteralController') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 3c05972224..f19c93ebac 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -30,7 +30,7 @@ #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" -#include "debug/MPU.hh" +#include "debug/CenteralController.hh" #include "mem/packet_access.hh" namespace gem5 @@ -137,7 +137,8 @@ CenteralController::ReqPort::recvReqRetry() void CenteralController::functionalAccess(PacketPtr pkt) { - DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + DPRINTF(CenteralController, + "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); reqPort.sendFunctional(pkt); } From a6542d9e54ac5ab20134d0ab1627afc9215f6c5a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 3 May 2022 09:33:52 -0700 Subject: [PATCH 099/279] Fixing a bug and adding new stats. 
--- configs/accl/sega.py | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 4 +++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 19 ++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 13 +++++++++++++ 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0907ba77de..bfdad58f72 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,13 +9,15 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=0, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=32) + attached_memory_atom_size=32, + cache_size="1MiB", + num_mshr_entry=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, + update_queue_size=32, on_the_fly_update_map_size=8) def getRespPort(self): @@ -113,6 +115,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = vertex_cache_line_size + self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e6503ea01d..fbe593507a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -199,7 +199,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (aligned_addr != cacheBlocks[block_index].addr) { stats.readMisses++; } else { - stats.readHits++; + stats.readHitUnderMisses++; } MSHRMap[block_index].push_back(addr); @@ -538,6 +538,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hits."), ADD_STAT(readMisses, statistics::units::Count::get(), "Number of cache misses."), 
+ ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections.") { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index efd19d3e9b..ce019ef969 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar numVertexWrites; statistics::Scalar readHits; statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e822b7168b..69b9f3f23e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) + nextPushEvent([this] { processNextPushEvent(); }, name()), + stats(*this) {} Port& @@ -207,6 +208,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); + stats.numUpdates++; DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); @@ -247,4 +249,19 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +PushEngine::PushStats::PushStats(PushEngine &_push) + : statistics::Group(&_push), + push(_push), + + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of sent updates.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 
ce9045e91a..7a6981daa0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -120,6 +120,19 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + struct PushStats : public statistics::Group + { + PushStats(PushEngine &push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numUpdates; + }; + + PushStats stats; + protected: virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); From 1d6d99b7962e705111fe49c837139152909d5dd4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 17 May 2022 10:56:09 -0700 Subject: [PATCH 100/279] Fixing double evicts. --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/coalesce_engine.cc | 27 ++++++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 3 --- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index bfdad58f72..b799b05dc5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=192, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -130,7 +130,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): num_mpus, self.cache_line_size, "2GiB", - "2GiB", + "14GiB", graph_path) mpus = [] @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate() + exit_event = m5.simulate(1000000000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fbe593507a..b41f6b1db7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -325,22 +325,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } -PacketPtr 
-CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { @@ -370,7 +354,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - evictQueue.push_back(block_index); + // TODO: Fix this hack + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + } + } + if (!found) { + evictQueue.push_back(block_index); + } DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce019ef969..e86014fc25 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -85,9 +85,6 @@ class CoalesceEngine : public BaseMemEngine std::deque evictQueue; - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From 8aedc2d472de0c3f8874a90e250ec09518c5984c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 18 May 2022 17:23:05 -0700 Subject: [PATCH 101/279] Fixing false dependency and deadlock issues. wip. 
--- src/accl/graph/sega/coalesce_engine.cc | 74 +++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b41f6b1db7..92d82bce35 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -347,9 +347,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Make this more general and programmable. - // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add - // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -359,6 +358,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) for (auto i : evictQueue) { if (i == block_index) { found = true; + break; } } if (!found) { @@ -376,6 +376,76 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } +void +CoalesceEngine::processNextApplyEvent() +{ + int block_index = applyQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseApplySchedules++; + } else if (!cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + "needed. 
Adding the cache line to evict schedule.\n", + __func__, block_index); + evictQueue.push_back(block_index); + } else { + for (int i = 0; i < numElementsPerLine; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + // TODO: Is this correct? + cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; + + if (cacheBlocks[block_index].items[i].prop != old_prop) { + if (peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i])) { + DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", + __func__, + cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + } else { + // peerPushEngine->setPushAlarm(); + // pendingPushAlarm = true; + return; + } + } + } + // TODO: This is where eviction policy goes + evictQueue.push_back(block_index); + } + + applyQueue.pop_front(); + + if ((!evictQueue.empty()) && + (!pendingAlarm()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); + } + + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextEvictEvent() +{ + int block_index = evictQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseEvictSchedules++; + } else { + int space_needed = cacheBlocks + } +} + void CoalesceEngine::processNextApplyAndCommitEvent() { From 4143b56e4e5641832459e9fcf0e71a208ae16d18 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 12:10:10 -0700 Subject: [PATCH 102/279] Decoupling apply and evict. Done. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 214 ++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 11 +- 3 files changed, 81 insertions(+), 146 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b799b05dc5..9d8b449e0f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=192, value=0, + self.ctrl = CenteralController(addr=0, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92d82bce35..f3402255bc 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,7 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + nextApplyEvent([this] { processNextApplyEvent(); }, name()), + nextEvictEvent([this] { processNextEvictEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -237,8 +238,8 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToAlarm() { - assert(!nextApplyAndCommitEvent.scheduled()); - schedule(nextApplyAndCommitEvent, nextCycle()); + assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + schedule(nextEvictEvent, nextCycle()); } bool @@ -362,16 +363,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (!found) { - evictQueue.push_back(block_index); + applyQueue.push_back(block_index); } DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty()) && - (!pendingAlarm())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } @@ -442,150 +442,74 @@ CoalesceEngine::processNextEvictEvent() __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks - } -} - -void -CoalesceEngine::processNextApplyAndCommitEvent() -{ - // FIXME: Refactor the line below to work with the new inheritance. - // assert((!alarmRequested) && (spaceRequested == 0)); - int block_index = evictQueue.front(); - uint8_t changedMask = 0; - - DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", - __func__, block_index); - DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " - "then commited.\n", __func__, block_index); - - if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 
2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); + int space_needed = cacheBlocks[block_index].hasChange ? + (cacheBlocks[block_index].hasConflict ? 2 : 1) : + (cacheBlocks[block_index].hasConflict ? 1 : 0); + if (!memReqQueueHasSpace(space_needed)) { + DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + "procees the eviction of cache line [%d]. hasChange: %d, " + "hasConflict: %d.\n", __func__, block_index, + cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].hasConflict); + requestAlarm(space_needed); return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - // TODO: Add a stat to count the number of changed props. 
- DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + } else { + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + "size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", - __func__, write_pkt->getAddr(), peerMemoryAtomSize); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet to " - "outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", - __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - } + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + Addr miss_addr = MSHRMap[block_index].front(); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + " is Addr: %lu.\n", __func__, block_index, miss_addr); - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for a read " - "packet.\n", __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // Since allocated is false, does not matter what the address is. - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, + read_pkt->getAddr(), read_pkt->getSize()); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + __func__, block_index, aligned_miss_addr); + } else { - } else { - DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " - "for eviction. Therefore, ignoring the evict schedule.\n", - __func__, block_index); + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + __func__, block_index); + } + } } evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!evictQueue.empty()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); } } @@ -604,7 +528,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections.") + "Number of cache rejections."), + ADD_STAT(falseApplySchedules, statistics::units::Count::get(), + "Number of failed apply schedules."), + ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), + "Number of failed evict schedules.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e86014fc25..82b03f53aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -83,13 +83,18 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + std::deque applyQueue; + std::deque evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); - EventFunctionWrapper nextApplyAndCommitEvent; - void processNextApplyAndCommitEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextEvictEvent; + void processNextEvictEvent(); struct CoalesceStats : public statistics::Group { @@ -105,6 +110,8 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; + statistics::Scalar falseApplySchedules; + statistics::Scalar falseEvictSchedules; }; CoalesceStats stats; From 72e1339971f9bb5f1f94204d70a5e5e8829fb825 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 21:20:07 -0700 Subject: [PATCH 103/279] Fixed miss-deallocation 
bug. Hopefully. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 24 +++--- src/accl/graph/base/base_mem_engine.hh | 17 ++-- src/accl/graph/sega/coalesce_engine.cc | 107 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 13 +-- src/accl/graph/sega/push_engine.cc | 26 ++++-- src/accl/graph/sega/push_engine.hh | 11 ++- src/accl/graph/sega/wl_engine.cc | 1 - src/accl/graph/sega/wl_engine.hh | 1 - 9 files changed, 136 insertions(+), 66 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9d8b449e0f..31b65ae726 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=16, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 50e64ae7c3..f02f1d2feb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -37,8 +37,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), + memAlarmRequested(false), + memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) @@ -106,12 +106,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (alarmRequested && + if (memAlarmRequested && (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - respondToAlarm(); + 
(outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -151,7 +151,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::memReqQueueHasSpace(int space) +BaseMemEngine::allocateMemReqSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -179,13 +179,13 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestAlarm(int space) { - panic_if((alarmRequested == true) || (spaceRequested != 0), +BaseMemEngine::requestMemAlarm(int space) { + panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - alarmRequested = true; - spaceRequested = space; + memAlarmRequested = true; + memSpaceRequested = space; } void diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fb7cab91b0..8a18807e2e 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,8 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; + bool memAlarmRequested; + int memSpaceRequested; std::deque outstandingMemReqQueue; EventFunctionWrapper nextMemReqEvent; @@ -81,15 +81,16 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - - bool memReqQueueHasSpace(int space); + bool allocateMemReqSpace(int space); bool memReqQueueFull(); + + bool pendingMemAlarm() { return memAlarmRequested; } + void requestMemAlarm(int space); + + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void 
enqueueMemReq(PacketPtr pkt); - bool pendingAlarm() { return alarmRequested; } - void requestAlarm(int space); - virtual void respondToAlarm() = 0; + virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f3402255bc..36faff2c6a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,6 +44,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + pendingPushAlarm(false), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -54,6 +55,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } void @@ -91,10 +93,11 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, responseQueue.size(), - cacheBlocks[block_index].items[wl_offset].to_string()); + __func__, addr, block_index, wl_offset, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; assert(!responseQueue.empty()); @@ -156,7 +159,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -236,9 +239,9 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToAlarm() +CoalesceEngine::respondToMemAlarm() { - assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); schedule(nextEvictEvent, nextCycle()); } @@ -290,7 +293,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block servicedIndices.push_back(i); @@ -336,27 +339,27 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].hasChange = true; + cacheBlocks[block_index].dirty = true; stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].takenMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack bool found = false; - for (auto i : evictQueue) { + for (auto i : applyQueue) { if (i == block_index) { found = true; break; @@ -364,12 +367,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (!found) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); } if ((!applyQueue.empty()) && + (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -381,16 +385,27 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].takenMask) { + if (cacheBlocks[block_index].busyMask) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].hasChange) { + } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - evictQueue.push_back(block_index); + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -407,20 +422,32 @@ CoalesceEngine::processNextApplyEvent() __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); } else { - // peerPushEngine->setPushAlarm(); - // pendingPushAlarm = true; + peerPushEngine->setPushAlarm(); + pendingPushAlarm = true; return; } } } // TODO: This is where eviction policy goes - evictQueue.push_back(block_index); + // TODO: Fix this hack. + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingAlarm()) && + (!pendingMemAlarm()) && (!nextEvictEvent.scheduled())) { schedule(nextEvictEvent, nextCycle()); } @@ -436,25 +463,33 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - if (cacheBlocks[block_index].takenMask) { + bool found_in_apply_queue = false; + for (auto i : applyQueue) { + if (i == block_index) { + found_in_apply_queue = true; + break; + } + } + if ((cacheBlocks[block_index].busyMask) || + (found_in_apply_queue)) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].hasChange ? + int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!memReqQueueHasSpace(space_needed)) { + if (!allocateMemReqSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. hasChange: %d, " + "procees the eviction of cache line [%d]. 
dirty: %d, " "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestAlarm(space_needed); + requestMemAlarm(space_needed); return; } else { - if (cacheBlocks[block_index].hasChange) { + if (cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( @@ -484,21 +519,21 @@ CoalesceEngine::processNextEvictEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", __func__, block_index); } @@ -513,6 +548,14 @@ CoalesceEngine::processNextEvictEvent() } } +void +CoalesceEngine::respondToPushAlarm() +{ + assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); + pendingPushAlarm = false; + schedule(nextApplyEvent, nextCycle()); +} + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 82b03f53aa..824faef10d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,21 +49,21 @@ class CoalesceEngine : public BaseMemEngine { WorkListItem* items; Addr addr; - uint8_t takenMask; + uint8_t busyMask; bool allocated; bool valid; bool hasConflict; - bool hasChange; + bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), - takenMask(0), + busyMask(0), allocated(false), valid(false), hasConflict(false), - hasChange(false) + dirty(false) { items = new WorkListItem [num_elements]; } @@ -83,6 +83,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + bool pendingPushAlarm; std::deque applyQueue; std::deque evictQueue; @@ -117,7 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -131,6 +132,8 @@ class CoalesceEngine : public BaseMemEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); + + 
void respondToPushAlarm(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 69b9f3f23e..d5563cca7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -36,6 +37,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), + pushAlarmSet(false), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -56,6 +58,12 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +{ + peerCoalesceEngine = coalesce_engine; +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -146,11 +154,15 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { + pushAlarmSet = false; + peerCoalesceEngine->respondToPushAlarm(); + } } if (memReqQueueFull()) { if (!pushReqQueue.empty()) { - requestAlarm(1); + requestMemAlarm(1); } return; } @@ -161,7 +173,7 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToAlarm() +PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); @@ -200,9 +212,6 @@ PushEngine::processNextPushEvent() // TODO: Implement propagate function here uint32_t update_value = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket( curr_edge->neighbor, update_value); @@ -249,6 +258,13 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +void +PushEngine::setPushAlarm() +{ + assert(!pushAlarmSet); + pushAlarmSet = true; +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7a6981daa0..ce24f862ba 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -36,6 +36,8 @@ namespace gem5 { +class CoalesceEngine; + class PushEngine : public BaseMemEngine { private: @@ -95,6 +97,9 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; + bool pushAlarmSet; + CoalesceEngine* peerCoalesceEngine; + ReqPort reqPort; Addr baseEdgeAddr; @@ -134,7 +139,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,6 +150,10 @@ class PushEngine : public BaseMemEngine PortID idx=InvalidPortID) override; bool recvWLItem(WorkListItem wl); + + void 
registerCoalesceEngine(CoalesceEngine* coalesce_engine); + + void setPushAlarm(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e949cbcf5b..75ac4f784e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -37,7 +37,6 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), - blockedByCoalescer(false), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 597fdb2b1e..27fc3efa7a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -69,7 +69,6 @@ class WLEngine : public BaseReduceEngine RespPort respPort; - bool blockedByCoalescer; CoalesceEngine* coalesceEngine; int updateQueueSize; From 83cb67a910e9202f568e49b0f7807563363eae2b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 14:15:30 -0700 Subject: [PATCH 104/279] Correctness passed with finite push queue and facebook graph. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 20 +++++++++++++------- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 3 ++- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 31b65ae726..8a6ac783c3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate(1000000000000) + exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36faff2c6a..39144972df 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -349,7 +349,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, + DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. @@ -409,15 +410,20 @@ CoalesceEngine::processNextApplyEvent() } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - // TODO: Is this correct? 
- cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; - if (cacheBlocks[block_index].items[i].prop != old_prop) { - if (peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i])) { + if (new_prop != old_prop) { + if (peerPushEngine->allocatePushSpace()) { + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", + __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5563cca7c..8cfe3c72cc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,7 +97,7 @@ PushEngine::ReqPort::recvReqRetry() } } -bool +void PushEngine::recvWLItem(WorkListItem wl) { // If there are no outdoing edges, no need to generate and push @@ -105,14 +105,14 @@ PushEngine::recvWLItem(WorkListItem wl) if (wl.degree == 0) { DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", __func__, wl.to_string()); - return true; + return; } assert((pushReqQueueSize == 0) || - (pushReqQueue.size() <= pushReqQueueSize)); - if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { - return false; - } + (pushReqQueue.size() < pushReqQueueSize)); + panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " + "method after checking if there is enough push space. 
Use " + "allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); @@ -125,7 +125,6 @@ PushEngine::recvWLItem(WorkListItem wl) (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } - return true; } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ce24f862ba..ae465f6eb1 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -149,7 +149,8 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool recvWLItem(WorkListItem wl); + bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + void recvWLItem(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine); From c79edd70e85cfe4c33851024a47fd979a163c941 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 17:49:06 -0700 Subject: [PATCH 105/279] Fixing an incorrect assertion. 
--- configs/accl/sega.py | 23 +++++++++++++++++------ src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 1 - 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8a6ac783c3..11e2cfb6af 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=64, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, @@ -109,7 +109,12 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, vertex_cache_line_size, graph_path): + def __init__(self, + num_mpus, + vertex_cache_line_size, + graph_path, + first_addr, + first_value): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -122,7 +127,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -147,13 +152,19 @@ def get_inputs(): argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, \ + args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, \ + graph_path, 
first_addr, first_value = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, \ + graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 39144972df..dd651f9e5a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -241,7 +241,7 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToMemAlarm() { - assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); + assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8cfe3c72cc..ed23fb4d4b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -176,7 +176,6 @@ PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); - DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool From 5e28bea0a9bfa825d1011c67a00e78207f9a965b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 3 Jun 2022 07:44:25 -0700 Subject: [PATCH 106/279] Converting apply and evict queues to FIFOSet. 
--- src/accl/graph/base/data_structs.hh | 50 +++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 68 +++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.hh | 3 +- 4 files changed, 76 insertions(+), 49 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 7535d4bbac..e03686a7e9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,6 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" +#include +#include + namespace gem5 { @@ -83,6 +86,53 @@ struct __attribute__ ((packed)) Edge }; static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); + +template +class FIFOSet +{ + private: + std::queue fifo; + std::unordered_set set; + + public: + FIFOSet(int cap) + { + set.reserve(cap); + } + + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); + } + } + + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } + + T& front() + { + return fifo.front(); + } + + size_t size() { + return fifo.size(); + } + + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dd651f9e5a..f96adbf8d8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,6 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), pendingPushAlarm(false), + applyQueue(numLines), + evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -55,6 +57,7 @@ CoalesceEngine::CoalesceEngine(const 
CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } @@ -141,14 +144,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; + if (!cacheBlocks[block_index].busyMask) { + applyQueue.push_back(block_index); + assert(!applyQueue.empty()); + if ((!nextApplyEvent.scheduled()) && + (!pendingPushAlarm)) { + schedule(nextApplyEvent, nextCycle()); + } + } return true; } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - //TODO: Fix this to work with new inheritance. - // assert( - // outstandingMemReqQueue.size() <= - // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); @@ -278,8 +285,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " @@ -333,7 +339,7 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -359,18 +365,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack - bool found = false; - for (auto i : applyQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { - applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } + applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && @@ -395,15 +392,9 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict) { evictQueue.push_back(block_index); + assert(!evictQueue.empty()); DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -435,15 +426,7 @@ CoalesceEngine::processNextApplyEvent() } } // TODO: This is where eviction policy goes - // TODO: Fix this hack. - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); @@ -469,15 +452,8 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - bool found_in_apply_queue = false; - for (auto i : applyQueue) { - if (i == block_index) { - found_in_apply_queue = true; - break; - } - } if ((cacheBlocks[block_index].busyMask) || - (found_in_apply_queue)) { + (applyQueue.find(block_index))) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); @@ -514,8 +490,8 @@ CoalesceEngine::processNextEvictEvent() " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; + roundDown(miss_addr, peerMemoryAtomSize); + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 824faef10d..177bb067ab 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,9 +84,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; bool pendingPushAlarm; - std::deque applyQueue; + FIFOSet applyQueue; - std::deque evictQueue; + FIFOSet evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ae465f6eb1..c93b3b386d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -31,6 +31,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" #include "params/PushEngine.hh" namespace gem5 @@ -59,7 +60,7 @@ class PushEngine : public BaseMemEngine std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); - Addr 
aligned_addr = std::floor(_start / _atom) * _atom; + Addr aligned_addr = roundDown(_start, _atom); Addr offset = _start - aligned_addr; int num_items = 0; From 80e37587ad43d87e4d81e279c579663f78414950 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 13 Jun 2022 14:48:49 -0700 Subject: [PATCH 107/279] Moving delete pkt in push_engine.cc. --- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ed23fb4d4b..cb71b73c60 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -228,8 +228,8 @@ PushEngine::processNextPushEvent() reqOffsetMap.erase(pkt->req); reqNumEdgeMap.erase(pkt->req); reqValueMap.erase(pkt->req); - delete pkt; memRespQueue.pop_front(); + delete pkt; } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { From 6c706e03e1aee11df7767f3ce122adc523a8c2a4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 19 Jun 2022 14:29:57 -0700 Subject: [PATCH 108/279] Enforced limited length on memRespQueue in PushEngine. 
--- configs/accl/sega.py | 15 +++++--- src/accl/graph/SConscript | 3 +- src/accl/graph/base/BaseMemEngine.py | 2 ++ src/accl/graph/base/base_mem_engine.cc | 49 +++++++++++++++++--------- src/accl/graph/base/base_mem_engine.hh | 4 +++ src/accl/graph/sega/coalesce_engine.cc | 5 ++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 5 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 12 +++++-- 10 files changed, 72 insertions(+), 25 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 11e2cfb6af..a5dd759f1f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,16 +9,21 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=64, - attached_memory_atom_size=64) + push_req_queue_size=1, + attached_memory_atom_size=64, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=16) + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=32, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 00fa2466dd..9663d3f263 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,4 +27,5 @@ Import('*') -DebugFlag('MPU') \ No newline at end of file +DebugFlag('MPU') +DebugFlag('SEGAQSize') diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py index 69f68e9dfc..2ecb6659d8 100644 --- a/src/accl/graph/base/BaseMemEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -43,3 +43,5 @@ class 
BaseMemEngine(ClockedObject): attached_memory_atom_size = Param.Int(64, "The atom size of the attached " "memory.") + + resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index f02f1d2feb..112b0d63cb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,8 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" +#include "debug/SEGAQSize.hh" + namespace gem5 { @@ -37,6 +39,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + onTheFlyReqs(0), + respQueueSize(params.resp_queue_size), memAlarmRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), @@ -73,7 +77,7 @@ bool BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); + return owner->recvTimingResp(pkt); } void @@ -98,20 +102,25 @@ BaseMemEngine::processNextMemReqEvent() return; } - // TODO: Maybe add a DPRINTF here. - PacketPtr pkt = outstandingMemReqQueue.front(); - memPort.sendPacket(pkt); - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - - if (memAlarmRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; - memSpaceRequested = 0; - respondToMemAlarm(); + if ((respBuffSize() == -1) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + onTheFlyReqs++; + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + outstandingMemReqQueue.pop_front(); + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); + + if (memAlarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); + } } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -171,7 +180,8 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); @@ -197,4 +207,11 @@ BaseMemEngine::wakeUp() } } +bool +BaseMemEngine::recvTimingResp(PacketPtr pkt) +{ + onTheFlyReqs--; + return handleMemResp(pkt); +} + } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 8a18807e2e..fc67f3f6d8 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,6 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; + int onTheFlyReqs; + int respQueueSize; bool memAlarmRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -90,6 +92,7 @@ class BaseMemEngine : public ClockedObject void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); + virtual int respBuffSize() = 0; virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; @@ -109,6 +112,7 @@ class BaseMemEngine : public ClockedObject AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + bool recvTimingResp(PacketPtr pkt); void 
recvFunctional(PacketPtr pkt); void wakeUp(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f96adbf8d8..ee1e3f85ff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -141,11 +141,14 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " - "line[%d]", __func__, addr, block_index); + "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. " + "applyQueue.size = %u.\n", __func__, + block_index, applyQueue.size()); assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled()) && (!pendingPushAlarm)) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 177bb067ab..1e353c11b8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: + virtual int respBuffSize() { return -1; } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cb71b73c60..a045bbdead 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -93,6 +93,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + DPRINTF(MPU, "%s: Sent the blockedPacket. " + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); + blockedPacket = nullptr; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c93b3b386d..2c17501d5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -140,6 +140,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: + virtual int respBuffSize() { return memRespQueue.size(); } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 75ac4f784e..55a9147ac9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,6 +136,9 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", + __func__, onTheFlyUpdateMap.size()); if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " @@ -147,6 +150,10 @@ WLEngine::processNextReadEvent() __func__, updateQueue.size()); respPort.checkRetryReq(); } + } else { + DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", __func__, + onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min @@ -209,8 +216,9 @@ WLEngine::processNextReduceEvent() coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" + "onTheFlyUpdateMap.size: %lu.\n", + __func__, addr, onTheFlyUpdateMap.size()); } addrWorkListMap.clear(); } From 74afbe46fa9f81d85ca650292b3ccfcb0a1ee446 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Jul 2022 10:36:46 -0700 Subject: [PATCH 109/279] Adding bit vector implementation for caching push meta data. --- configs/accl/sega.py | 7 +- src/accl/graph/base/base_mem_engine.cc | 10 +- src/accl/graph/base/data_structs.hh | 86 +++++++++------- src/accl/graph/sega/CoalesceEngine.py | 3 + src/accl/graph/sega/coalesce_engine.cc | 137 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 14 ++- src/accl/graph/sega/push_engine.cc | 62 ++++++++--- src/accl/graph/sega/push_engine.hh | 12 ++- 8 files changed, 227 insertions(+), 104 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a5dd759f1f..96408aa185 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=1, + push_req_queue_size=0, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,8 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=2) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) @@ -77,7 +76,7 @@ def __init__(self, ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], - bandwidth="19.2GB/s", + bandwidth="4.8GB/s", latency="30ns", image_file=f"{graph_path}/edgelist_{i}") ) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 112b0d63cb..3086b81fc2 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,7 +29,6 @@ #include 
"accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" namespace gem5 { @@ -102,8 +101,8 @@ BaseMemEngine::processNextMemReqEvent() return; } - if ((respBuffSize() == -1) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || + (respQueueSize == 0)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -111,8 +110,6 @@ BaseMemEngine::processNextMemReqEvent() "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); if (memAlarmRequested && (outstandingMemReqQueue.size() <= @@ -180,8 +177,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); + assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e03686a7e9..e30d6029cb 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,8 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include +#include namespace gem5 { @@ -91,49 +92,64 @@ static_assert(isPowerOf2(sizeof(Edge))); template class FIFOSet { - private: - std::queue fifo; - std::unordered_set set; - - public: - FIFOSet(int cap) - { - set.reserve(cap); - } + private: + std::queue fifo; + std::unordered_set set; - void push_back(T item) - { - if (set.find(item) == set.end()) { - set.insert(item); - fifo.push(item); - } - } + public: + FIFOSet(int cap) + { + set.reserve(cap); + } - void pop_front() - { - T front = fifo.front(); - 
set.erase(front); - fifo.pop(); + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); } + } - T& front() - { - return fifo.front(); - } + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } - size_t size() { - return fifo.size(); - } + T& front() + { + return fifo.front(); + } - bool empty() { - return fifo.empty(); - } + size_t size() { + return fifo.size(); + } - bool find(T item) { - return (set.find(item) != set.end()); - } + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } }; +// template +// class BitVector +// { +// private: +// int it; +// std::bitset bitStore; + +// public: +// BitVector(): it(0) { bitStore.reset(); } + +// uint32_t next() { + +// } +// }; + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 086f284950..7667a22c5a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -41,4 +41,7 @@ class CoalesceEngine(BaseMemEngine): num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + # Don't change. If changed. It will break functionality of coalesce. 
+ resp_queue_size = 0 + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ee1e3f85ff..b5eeae694e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,7 +44,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - pendingPushAlarm(false), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), @@ -58,7 +57,9 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } - peerPushEngine->registerCoalesceEngine(this); + peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); + + needsApply.reset(); } void @@ -67,6 +68,38 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) sendMemFunctional(pkt); } +void +CoalesceEngine::startup() +{ + AddrRangeList vertex_ranges = getAddrRanges(); + + bool found = false; + Addr first_match_addr = 0; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(first_match_addr)) { + found = true; + break; + } + } + first_match_addr += peerMemoryAtomSize; + } + + found = false; + Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(second_match_addr)) { + found = true; + break; + } + } + second_match_addr += peerMemoryAtomSize; + } + + nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); +} + void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -150,8 +183,7 @@ CoalesceEngine::recvReadAddr(Addr addr) "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled()) && - (!pendingPushAlarm)) { + if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); 
} } @@ -363,18 +395,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - // TODO: Fix this hack applyQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && - (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -393,14 +423,7 @@ CoalesceEngine::processNextApplyEvent() stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " - "needed. Adding the cache line to evict schedule.\n", - __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - assert(!evictQueue.empty()); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -409,31 +432,38 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[i].tempProp); if (new_prop != old_prop) { - if (peerPushEngine->allocatePushSpace()) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", - __func__, + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", - __func__, - cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + + Addr block_addr = cacheBlocks[block_index].addr; + int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits + i; + + if (needsApply[bit_index] == 1) { + DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." + " Not doing anything further.\n", __func__, + block_addr + (i * sizeof(WorkListItem))); } else { - peerPushEngine->setPushAlarm(); - pendingPushAlarm = true; - return; + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsApply[bit_index] = 1; + } } } } - // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + } + + // TODO: This is where eviction policy goes + if (cacheBlocks[block_index].hasConflict){ + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } applyQueue.pop_front(); @@ -536,9 +566,42 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::respondToPushAlarm() { - assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); - pendingPushAlarm = false; - schedule(nextApplyEvent, nextCycle()); + DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + int it; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + uint32_t slice = 0; + for (int i = 0; i < numElementsPerLine; i++) { + slice <<= 1; + slice |= needsApply[it + i]; + } + if (slice) { + break; + } + } + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + __func__, slice, it); + + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); + int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + // hit in cache + bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + for (int i = 0; i < numElementsPerLine; i++) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], do_push); + } + + // TODO: Should we add block_index to evict_queue? 
+ if (do_push && cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + } + } else { + PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); + + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 1e353c11b8..e6c70502af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,12 +29,16 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#include + #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +#define MAX_BITVECTOR_SIZE (1 << 30) + // TODO: Add parameters for size, memory atom size, type size, // length of items in the blocks. namespace gem5 @@ -68,6 +72,7 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + int nmpu; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -83,8 +88,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; - bool pendingPushAlarm; FIFOSet applyQueue; + int needsApplyFirstPointer; + std::bitset needsApply; FIFOSet evictQueue; @@ -127,14 +133,16 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - void recvFunctional(PacketPtr pkt); - bool recvReadAddr(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); void respondToPushAlarm(); + + void recvFunctional(PacketPtr pkt); + + virtual void startup(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a045bbdead..8bc2d55a28 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" 
+#include "debug/SEGAQSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -37,9 +38,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - pushAlarmSet(false), + retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), + numRetries(0), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -59,9 +61,11 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line) { peerCoalesceEngine = coalesce_engine; + numElementsPerLine = elements_per_line; } void @@ -115,15 +119,21 @@ PushEngine::recvWLItem(WorkListItem wl) assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); - panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " - "method after checking if there is enough push space. Use " - "allocatePushSpace.\n"); + panic_if((pushReqQueue.size() == pushReqQueueSize) && + (pushReqQueueSize != 0), "You should call this method after " + "checking if there is enough push space. 
Use allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + + if (curTick() % 50000 == 0) { + DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", + __func__, pushReqQueue.size()); + } assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -132,6 +142,25 @@ PushEngine::recvWLItem(WorkListItem wl) } } +void +PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +{ + if (do_push) { + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + } + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } +} + void PushEngine::processNextAddrGenEvent() { @@ -158,8 +187,10 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { - pushAlarmSet = false; + if (numRetries > 0) { + retrySpaceAllocated++; + } + if ((retrySpaceAllocated % numElementsPerLine) == 0) { peerCoalesceEngine->respondToPushAlarm(); } } @@ -261,17 +292,20 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -void -PushEngine::setPushAlarm() -{ - assert(!pushAlarmSet); - pushAlarmSet = true; +bool +PushEngine::allocatePushSpace() { + if ((pushReqQueueSize == 0) || + ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + return true; + } else { + numRetries++; + return false; + } } PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2c17501d5b..4f388cd7e6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -98,13 +98,15 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; - bool pushAlarmSet; + int numElementsPerLine; + int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; + int numRetries; int pushReqQueueSize; std::deque pushReqQueue; @@ -151,12 +153,14 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + bool allocatePushSpace(); + void recvWLItem(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine); + void recvWLItemRetry(WorkListItem wl, bool do_push); - void setPushAlarm(); + void registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line); }; } From b77bc190256b0197e2b91d5c8c1f4b7bbd5ec2a4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: 
Sun, 17 Jul 2022 16:12:07 -0700 Subject: [PATCH 110/279] Completing retry between coalesce and push engine. --- configs/accl/sega.py | 4 +- src/accl/graph/SConscript | 1 + src/accl/graph/TODO.md | 7 +- src/accl/graph/base/base_mem_engine.cc | 13 ++- src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 155 +++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 12 ++ src/accl/graph/sega/push_engine.cc | 11 +- 8 files changed, 157 insertions(+), 49 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 96408aa185..65645b3bb3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=4, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=2) + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 9663d3f263..36e16affa3 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -29,3 +29,4 @@ Import('*') DebugFlag('MPU') DebugFlag('SEGAQSize') +DebugFlag('MahyarMath') diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 29b5a2939e..ebfca7e794 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,5 +1,8 @@ # TODO Items -* Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. 
\ No newline at end of file + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 3086b81fc2..64aaa3a737 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -159,17 +159,22 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemReqSpace(int space) { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); return ( - outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + (outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) ); } bool BaseMemEngine::memReqQueueFull() { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + return ( + (outstandingMemReqQueueSize != 0) && + (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); } void diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e30d6029cb..9c250c6a2f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,6 +93,7 @@ template class FIFOSet { private: + // int numInvalids; std::queue fifo; std::unordered_set set; @@ -127,7 +128,7 @@ class FIFOSet } bool empty() { - return fifo.empty(); + return (size() == 0); } bool find(T item) { diff --git 
a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b5eeae694e..1c3f2bcadf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -75,29 +76,39 @@ CoalesceEngine::startup() bool found = false; Addr first_match_addr = 0; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(first_match_addr)) { found = true; break; } } + if (found) { + break; + } first_match_addr += peerMemoryAtomSize; } found = false; Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(second_match_addr)) { found = true; break; } } + if (found) { + break; + } second_match_addr += peerMemoryAtomSize; } nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + memoryAddressOffset = first_match_addr; + DPRINTF(MahyarMath, "%s: Initialized address translation information." 
+ " nmpu: %d, memoryAddressOffset: %lu.\n", + __func__, nmpu, memoryAddressOffset); } void @@ -106,6 +117,40 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + return ((int) (addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", + __func__, addr); + int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits; + DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", + __func__, addr, bit_index); + return bit_index; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", + __func__, index); + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); + DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", + __func__, index, (block_addr + memoryAddressOffset)); + return (block_addr + memoryAddressOffset); +} + bool CoalesceEngine::recvReadAddr(Addr addr) { @@ -298,6 +343,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + if (pkt->findNextSenderState()) { + Addr addr = pkt->getAddr(); + int it = getBitIndexBase(addr); + int block_index = getBlockIndex(addr); + bool found_in_cache = (cacheBlocks[block_index].addr == addr); + + // We have to send the items regardless of them being found in the + // cache. However, if they are found in the cache, two things should + // happen. First, do_push should be set to false and the bit vector + // value for the items should not change. 
To future Mahyar and Marjan, + // If this is confusing, please look at where each item is pushed to + // the apply queue. Hint: Think about updates that might not be sent + // out if you reset the bit regardless of the line being found in the + // cache. + WorkListItem* items = pkt->getPtr(); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[it + i] = + (needsApply[it + i] == 1) && found_in_cache ? 1 : 0; + + peerPushEngine->recvWLItemRetry(items[i], + ((!found_in_cache) && needsApply[it + i])); + } + return true; + } + Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; @@ -395,11 +465,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); + int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[bit_index + i] = 0; + } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -438,22 +512,15 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - Addr block_addr = cacheBlocks[block_index].addr; - int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits + i; + int bit_index = + getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (needsApply[bit_index] == 1) { - DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." 
- " Not doing anything further.\n", __func__, - block_addr + (i * sizeof(WorkListItem))); + assert(needsApply[bit_index] == 0); + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); } else { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; - } + needsApply[bit_index] = 1; } } } @@ -567,40 +634,56 @@ void CoalesceEngine::respondToPushAlarm() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); - int it; + Addr block_addr = 0; + int block_index = 0; + int it = 0; + uint32_t slice = 0; + bool hit_in_cache = false; for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - uint32_t slice = 0; for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsApply[it + i]; } if (slice) { - break; + block_addr = getBlockAddrFromBitIndex(it); + block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + if (cacheBlocks[block_index].busyMask == 0) { + hit_in_cache = true; + break; + } + } else { + hit_in_cache = false; + break; + } } } + + assert(it < MAX_BITVECTOR_SIZE); + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", __func__, slice, it); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); - int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; - - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - // hit in cache - bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], do_push); - } - - // TODO: Should we add block_index to evict_queue? 
- if (do_push && cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); + peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], + (needsApply[it + i] == 1)); + needsApply[it + i] = 0; } } else { + // FIXME: Fix the retry mechanism between memory and cache to + // handle memory retries correctly. This probably requires scheduling + // an event for sending the retry. For now we're enabling infinite + // queueing in the outstandingMemReqQueue. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + if (allocateMemReqSpace(1)) { + enqueueMemReq(pkt); + } else { + requestMemAlarm(1); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e6c70502af..973ea479c1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -72,7 +72,15 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + int nmpu; + Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -94,6 +102,10 @@ class CoalesceEngine : public BaseMemEngine FIFOSet evictQueue; + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8bc2d55a28..fa611392b4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -149,9 +149,13 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - - 
pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + if (wl.degree != 0) { + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + } else { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + } numRetries--; } retrySpaceAllocated--; @@ -164,7 +168,6 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) void PushEngine::processNextAddrGenEvent() { - Addr aligned_addr, offset; int num_edges; From 5742163933ffb0fdca1854d3d855c983b1b3310a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 07:33:16 -0700 Subject: [PATCH 111/279] Updating variable names and debug flags. --- src/accl/graph/SConscript | 3 +- src/accl/graph/base/base_mem_engine.cc | 20 ++--- src/accl/graph/base/base_mem_engine.hh | 12 +-- src/accl/graph/base/data_structs.hh | 33 +++----- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/coalesce_engine.cc | 100 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 9 +-- src/accl/graph/sega/push_engine.cc | 53 ++++++------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 2 +- 10 files changed, 113 insertions(+), 124 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 36e16affa3..7ca60c30bd 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,4 @@ Import('*') DebugFlag('MPU') -DebugFlag('SEGAQSize') -DebugFlag('MahyarMath') +# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 64aaa3a737..32c314033d 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,7 +40,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), 
respQueueSize(params.resp_queue_size), - memAlarmRequested(false), + memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), @@ -111,12 +111,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (memAlarmRequested && + if (memRetryRequested && (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; + memRetryRequested = false; memSpaceRequested = 0; - respondToMemAlarm(); + recvMemRetry(); } } @@ -157,7 +157,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::allocateMemReqSpace(int space) +BaseMemEngine::allocateMemQueueSpace(int space) { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -168,7 +168,7 @@ BaseMemEngine::allocateMemReqSpace(int space) } bool -BaseMemEngine::memReqQueueFull() +BaseMemEngine::memQueueFull() { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -180,7 +180,7 @@ BaseMemEngine::memReqQueueFull() void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { - panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); assert(!outstandingMemReqQueue.empty()); @@ -190,12 +190,12 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestMemAlarm(int space) { - panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), +BaseMemEngine::requestMemRetry(int space) { + panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - memAlarmRequested = true; 
+ memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fc67f3f6d8..64ef49ee1d 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -71,7 +71,7 @@ class BaseMemEngine : public ClockedObject int outstandingMemReqQueueSize; int onTheFlyReqs; int respQueueSize; - bool memAlarmRequested; + bool memRetryRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -83,17 +83,17 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - bool allocateMemReqSpace(int space); - bool memReqQueueFull(); + bool allocateMemQueueSpace(int space); + bool memQueueFull(); - bool pendingMemAlarm() { return memAlarmRequested; } - void requestMemAlarm(int space); + bool pendingMemRetry() { return memRetryRequested; } + void requestMemRetry(int space); void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); virtual int respBuffSize() = 0; - virtual void respondToMemAlarm() = 0; + virtual void recvMemRetry() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 9c250c6a2f..f938be72f1 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,8 +93,6 @@ template class FIFOSet { private: - // int numInvalids; - std::queue fifo; std::unordered_set set; public: @@ -107,24 +105,22 @@ class FIFOSet { if (set.find(item) == set.end()) { set.insert(item); - fifo.push(item); } } void pop_front() { - T front = fifo.front(); - set.erase(front); - fifo.pop(); + assert(set.begin() != set.end()); + set.erase(set.begin()); } - T& front() + T front() { - return fifo.front(); + return *(set.begin()); } size_t size() { - return fifo.size(); + return set.size(); } bool empty() { @@ -134,22 +130,11 @@ 
class FIFOSet bool find(T item) { return (set.find(item) != set.end()); } -}; - -// template -// class BitVector -// { -// private: -// int it; -// std::bitset bitStore; - -// public: -// BitVector(): it(0) { bitStore.reset(); } -// uint32_t next() { - -// } -// }; + void erase(T item) { + set.erase(item); + } +}; } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 16fab86ede..77e508f4ed 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,3 +39,6 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') +DebugFlag('CoalesceEngine') +DebugFlag('PushEngine') +DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c3f2bcadf..66b8e1fad7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -60,7 +59,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsApply.reset(); + needsPush.reset(); } void @@ -106,9 +105,6 @@ CoalesceEngine::startup() nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); memoryAddressOffset = first_match_addr; - DPRINTF(MahyarMath, "%s: Initialized address translation information." 
- " nmpu: %d, memoryAddressOffset: %lu.\n", - __func__, nmpu, memoryAddressOffset); } void @@ -128,13 +124,9 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { - DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", - __func__, addr); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; - DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", - __func__, addr, bit_index); return bit_index; } @@ -142,17 +134,13 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { - DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", - __func__, index); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", - __func__, index, (block_addr + memoryAddressOffset)); return (block_addr + memoryAddressOffset); } bool -CoalesceEngine::recvReadAddr(Addr addr) +CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", @@ -239,7 +227,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memReqQueueFull()) { + if (memQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); stats.readRejections++; @@ -326,7 +314,7 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToMemAlarm() +CoalesceEngine::recvMemRetry() { assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); @@ -347,8 +335,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool found_in_cache = (cacheBlocks[block_index].addr == addr); + bool line_do_push = false; + if (cacheBlocks[block_index].addr == addr) { + if (cacheBlocks[block_index].busyMask == 0) { + assert(applyQueue.find(block_index)); + line_do_push = true; + } else { + line_do_push = false; + } + } // We have to send the items regardless of them being found in the // cache. However, if they are found in the cache, two things should // happen. First, do_push should be set to false and the bit vector @@ -359,11 +355,19 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // cache. WorkListItem* items = pkt->getPtr(); for (int i = 0; i < numElementsPerLine; i++) { - needsApply[it + i] = - (needsApply[it + i] == 1) && found_in_cache ? 
1 : 0; - + assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(items[i], - ((!found_in_cache) && needsApply[it + i])); + (line_do_push && needsPush[it + i])); + } + + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } return true; } @@ -470,10 +474,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); - for (int i = 0; i < numElementsPerLine; i++) { - needsApply[bit_index + i] = 0; - } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -488,6 +488,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { + if (applyQueue.empty()) { + return; + } + int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -514,13 +518,13 @@ CoalesceEngine::processNextApplyEvent() int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - - assert(needsApply[bit_index] == 0); - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; + if (cacheBlocks[block_index].items[i].degree != 0) { + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsPush[bit_index] = 1; + } } } } @@ -536,7 +540,7 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingMemAlarm()) && + (!pendingMemRetry()) && (!nextEvictEvent.scheduled())) { 
schedule(nextEvictEvent, nextCycle()); } @@ -562,13 +566,13 @@ CoalesceEngine::processNextEvictEvent() int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemReqSpace(space_needed)) { + if (!allocateMemQueueSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestMemAlarm(space_needed); + requestMemRetry(space_needed); return; } else { if (cacheBlocks[block_index].dirty) { @@ -631,7 +635,7 @@ CoalesceEngine::processNextEvictEvent() } void -CoalesceEngine::respondToPushAlarm() +CoalesceEngine::recvPushRetry() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); Addr block_addr = 0; @@ -639,14 +643,15 @@ CoalesceEngine::respondToPushAlarm() int it = 0; uint32_t slice = 0; bool hit_in_cache = false; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; - slice |= needsApply[it + i]; + slice |= needsPush[it + i]; } if (slice) { block_addr = getBlockAddrFromBitIndex(it); - block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + block_index = getBlockIndex(block_addr); if ((cacheBlocks[block_index].addr == block_addr) && (cacheBlocks[block_index].valid)) { if (cacheBlocks[block_index].busyMask == 0) { @@ -662,14 +667,23 @@ CoalesceEngine::respondToPushAlarm() assert(it < MAX_BITVECTOR_SIZE); - DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + DPRINTF(MPU, "%s: Found slice %u at %d position in needsPush.\n", __func__, slice, it); if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + 
cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsApply[it + i] == 1)); - needsApply[it + i] = 0; + (needsPush[it + i] == 1)); + needsPush[it + i] = 0; + } + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -679,10 +693,10 @@ CoalesceEngine::respondToPushAlarm() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemReqSpace(1)) { + if (allocateMemQueueSpace(1)) { enqueueMemReq(pkt); } else { - requestMemAlarm(1); + requestMemRetry(1); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 973ea479c1..0fa555c84a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -97,8 +97,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; FIFOSet applyQueue; - int needsApplyFirstPointer; - std::bitset needsApply; + std::bitset needsPush; FIFOSet evictQueue; @@ -137,7 +136,7 @@ class CoalesceEngine : public BaseMemEngine protected: virtual int respBuffSize() { return -1; } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,12 +144,12 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - bool recvReadAddr(Addr addr); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void respondToPushAlarm(); + void recvPushRetry(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index fa611392b4..16e0ca6c6c 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,7 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" +#include "debug/PushEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -109,13 +109,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvWLItem(WorkListItem wl) { - // If there are no outdoing edges, no need to generate and push - // updates. Therefore, we only need to return true. - if (wl.degree == 0) { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - return; - } + assert(wl.degree != 0); assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); @@ -123,6 +117,7 @@ PushEngine::recvWLItem(WorkListItem wl) (pushReqQueueSize != 0), "You should call this method after " "checking if there is enough push space. Use allocatePushSpace.\n"); + DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; @@ -130,14 +125,9 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - if (curTick() % 50000 == 0) { - DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", - __func__, pushReqQueue.size()); - } - assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { + (!memQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } } @@ -145,24 +135,22 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) { + DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", + __func__, wl.to_string(), do_push ? 
"true" : "false"); if (do_push) { Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - if (wl.degree != 0) { - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - } else { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - } + assert(wl.degree != 0); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); numRetries--; + if ((!nextAddrGenEvent.scheduled()) && + (!memQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } } retrySpaceAllocated--; - if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); - } } void @@ -173,7 +161,7 @@ PushEngine::processNextAddrGenEvent() PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(MPU, "%s: Current packet information generated by " + DPRINTF(PushEngine, "%s: Current packet information generated by " "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -185,22 +173,22 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { - DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. " + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); if (numRetries > 0) { retrySpaceAllocated++; } if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->respondToPushAlarm(); + peerCoalesceEngine->recvPushRetry(); } } - if (memReqQueueFull()) { + if (memQueueFull()) { if (!pushReqQueue.empty()) { - requestMemAlarm(1); + requestMemRetry(1); } return; } @@ -211,9 +199,10 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToMemAlarm() +PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); + DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4f388cd7e6..11122067d6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -143,7 +143,7 @@ class PushEngine : public BaseMemEngine protected: virtual int respBuffSize() { return memRespQueue.size(); } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 55a9147ac9..27ba5c40c8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -139,7 +139,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " "onTheFlyUpdateMap.size: %lu.\n", __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From ebcf6b8575111414a46cdfc6686f5f38f4c80a4a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 14:33:22 -0700 Subject: [PATCH 112/279] Somewhat fixing the correctness. 
--- src/accl/graph/sega/coalesce_engine.cc | 97 +++++++++++++++++--------- src/accl/graph/sega/push_engine.cc | 3 +- 2 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66b8e1fad7..274d85a5b1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -117,6 +117,7 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) int CoalesceEngine::getBlockIndex(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); return ((int) (addr / peerMemoryAtomSize)) % numLines; } @@ -124,6 +125,7 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; @@ -134,6 +136,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); return (block_addr + memoryAddressOffset); @@ -336,39 +339,62 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool line_do_push = false; - if (cacheBlocks[block_index].addr == addr) { + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + // We read the address to send the wl but it is put in cache before + // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { - assert(applyQueue.find(block_index)); - line_do_push = true; + // It is not busy anymore, we have to send the wl from cache. 
+ for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], needsPush[it + i]); + needsPush[it + i] = 0; + } + // Since we have just applied the line, we can take it out of + // the applyQueue if it's in there. No need to do the same + // thing for evictQueue. + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + } } else { - line_do_push = false; + // The line is busy. Therefore, we have to disregard the data + // we received from the memory and also tell the push engine to + // deallocate the space it allocated for this retry. However, + // we still have to rememeber that these items need a retry. + // i.e. don't change needsPush, call recvWLItemRetry with + // do_push = false + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], false); + } + } + } else { + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. + WorkListItem* items = pkt->getPtr(); + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + needsPush[it + i] = 0; } - } - // We have to send the items regardless of them being found in the - // cache. 
However, if they are found in the cache, two things should - // happen. First, do_push should be set to false and the bit vector - // value for the items should not change. To future Mahyar and Marjan, - // If this is confusing, please look at where each item is pushed to - // the apply queue. Hint: Think about updates that might not be sent - // out if you reset the bit regardless of the line being found in the - // cache. - WorkListItem* items = pkt->getPtr(); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(items[i], - (line_do_push && needsPush[it + i])); } - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - } + delete pkt; return true; } @@ -488,9 +514,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - if (applyQueue.empty()) { - return; - } + // if (applyQueue.empty()) { + // return; + // } int block_index = applyQueue.front(); @@ -515,10 +541,12 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (cacheBlocks[block_index].items[i].degree != 0) { + if ((cacheBlocks[block_index].items[i].degree != 0) && + (needsPush[bit_index] == 0)) { + // If the respective bit in the bit vector is set + // there is no need to try and resend it. 
if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( cacheBlocks[block_index].items[i]); @@ -684,6 +712,9 @@ CoalesceEngine::recvPushRetry() } if (applyQueue.find(block_index)) { applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } } } else { // FIXME: Fix the retry mechanism between memory and cache to diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 16e0ca6c6c..044429f8fc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,12 +97,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + blockedPacket = nullptr; DPRINTF(MPU, "%s: Sent the blockedPacket. " "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); - - blockedPacket = nullptr; } } From ca7eb6ce2faff5fc8b3a059b57ced9385a62956a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 01:31:49 -0700 Subject: [PATCH 113/279] Almost fixed retry bugs. 14 wrong vertices in lj. 
--- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 18 ++--- src/accl/graph/sega/coalesce_engine.cc | 95 ++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 5 ++ src/accl/graph/sega/push_engine.cc | 101 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 4 +- 6 files changed, 170 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 65645b3bb3..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -174,5 +174,5 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print("Simulation finished!") + print(f"Exited simulation because {exit_event.getCause()}") exit() diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 32c314033d..e05357950b 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -97,12 +97,8 @@ BaseMemEngine::MemPort::recvReqRetry() void BaseMemEngine::processNextMemReqEvent() { - if (memPort.blocked()) { - return; - } - - if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || - (respQueueSize == 0)) { + if ((respQueueSize == 0) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -120,7 +116,8 @@ BaseMemEngine::processNextMemReqEvent() } } - if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + if ((!memPort.blocked()) && + (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -183,8 +180,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { + if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -202,8 +198,8 @@ 
BaseMemEngine::requestMemRetry(int space) { void BaseMemEngine::wakeUp() { - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { + assert(!nextMemReqEvent.scheduled()); + if (!outstandingMemReqQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 274d85a5b1..dde6e46aa9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/CoalesceEngine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -44,11 +45,14 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + currentBitSliceIndex(0), + numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -344,6 +348,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // We read the address to send the wl but it is put in cache before // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as idle.\n", + __func__, addr); + int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && @@ -354,10 +362,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -366,6 +379,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // The line is busy. Therefore, we have to disregard the data @@ -374,24 +394,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // we still have to rememeber that these items need a retry. // i.e. don't change needsPush, call recvWLItemRetry with // do_push = false - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], false); - } + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as busy.\n", + __func__, addr); + peerPushEngine->deallocatePushSpace(numElementsPerLine); } } else { // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. 
+ DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); WorkListItem* items = pkt->getPtr(); + int push_needed = 0; // No applying of the line needed. for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); } delete pkt; @@ -514,10 +541,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - // if (applyQueue.empty()) { - // return; - // } - int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -665,14 +688,23 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::recvPushRetry() { - DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + numRetriesReceived++; + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextSendRetryEvent() +{ + DPRINTF(MPU, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; uint32_t slice = 0; bool hit_in_cache = false; - for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -691,14 +723,23 @@ CoalesceEngine::recvPushRetry() break; } } + if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { + it = 0; + } } assert(it < MAX_BITVECTOR_SIZE); + if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { + currentBitSliceIndex = 0; + } else { + currentBitSliceIndex = it + numElementsPerLine; + } - DPRINTF(MPU, "%s: Found slice %u at %d position 
in needsPush.\n", - __func__, slice, it); + DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " + "in needsPush.\n", __func__, slice, it); if (hit_in_cache) { + int push_needed = 0; for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -706,15 +747,26 @@ CoalesceEngine::recvPushRetry() cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsPush[it + i] == 1)); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -730,6 +782,11 @@ CoalesceEngine::recvPushRetry() requestMemRetry(1); } } + + numRetriesReceived--; + if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { + schedule(nextSendRetryEvent, nextCycle()); + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0fa555c84a..e1033a4622 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,6 +96,8 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + int currentBitSliceIndex; + int numRetriesReceived; FIFOSet applyQueue; std::bitset needsPush; @@ -114,6 
+116,9 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper nextEvictEvent; void processNextEvictEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 044429f8fc..d493b34c53 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -105,6 +105,35 @@ PushEngine::ReqPort::recvReqRetry() } } +void +PushEngine::deallocatePushSpace(int space) +{ + retrySpaceAllocated -= space; + DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " + "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " + "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", + __func__, space, numRetries, + nextAddrGenEvent.scheduled() ? "true" : "false", + pendingMemRetry() ? "true" : "false", + pushReqQueue.size(), retrySpaceAllocated); + /// DISCUSS: Might have to check whether the addrGenEvent is scheduled + // and or the pushReqQueue is empty. If so we might need to + // send retries. + if ((numRetries > 0) && + ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + int free_space = + pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + if (free_space > numElementsPerLine) { + DPRINTF(PushEngine, "%s: Found %d free spaces. 
" + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + retrySpaceAllocated += numElementsPerLine; + peerCoalesceEngine->recvPushRetry(); + } + } +} + void PushEngine::recvWLItem(WorkListItem wl) { @@ -124,32 +153,41 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - assert(!pushReqQueue.empty()); - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { + schedule(nextAddrGenEvent, nextCycle()); + } } } void -PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +PushEngine::recvWLItemRetry(WorkListItem wl) { - DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", - __func__, wl.to_string(), do_push ? "true" : "false"); - if (do_push) { - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - assert(wl.degree != 0); - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - numRetries--; - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { + assert(wl.degree != 0); + DPRINTF(PushEngine, "%s: Received %s with retry.\n", + __func__, wl.to_string()); + + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { schedule(nextAddrGenEvent, nextCycle()); } } - retrySpaceAllocated--; } void @@ -177,11 +215,27 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: 
Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + // retrySpaceAllocated++; + // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " + // "retrySpaceAllocated = %d.\n", + // __func__, retrySpaceAllocated); + // if ((retrySpaceAllocated % numElementsPerLine) == 0) { + // peerCoalesceEngine->recvPushRetry(); + // } + // } if (numRetries > 0) { - retrySpaceAllocated++; - } - if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->recvPushRetry(); + int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + DPRINTF(PushEngine, "%s: Found %d free spaces in " + "the pushReqQueue.\n", __func__, free_space); + if (free_space > numElementsPerLine) { + retrySpaceAllocated += numElementsPerLine; + DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + peerCoalesceEngine->recvPushRetry(); + } } } @@ -201,7 +255,7 @@ void PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } @@ -285,6 +339,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { + assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { return true; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 11122067d6..9025ae9946 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -155,9 +155,11 @@ class PushEngine : public BaseMemEngine bool allocatePushSpace(); + void deallocatePushSpace(int space); + void recvWLItem(WorkListItem wl); - void 
recvWLItemRetry(WorkListItem wl, bool do_push); + void recvWLItemRetry(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); From ec9b0e83cb9a8dca2f222723e553f79c786a79b3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 11:36:14 -0700 Subject: [PATCH 114/279] Deleting comments and updating config. --- configs/accl/sega.py | 14 +++++++------- src/accl/graph/sega/push_engine.cc | 14 ++------------ 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..15431088d2 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=64, + resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, + num_mshr_entry=32, + num_tgts_per_mshr=4, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + on_the_fly_update_map_size=16) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d493b34c53..e87f4d275e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,16 +215,6 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - // retrySpaceAllocated++; - // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " - // "retrySpaceAllocated = %d.\n", - // __func__, retrySpaceAllocated); - // if ((retrySpaceAllocated % numElementsPerLine) == 0) { - // peerCoalesceEngine->recvPushRetry(); - // } - // } if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); DPRINTF(PushEngine, "%s: Found %d free spaces in " @@ -232,8 +222,8 @@ PushEngine::processNextAddrGenEvent() if (free_space > numElementsPerLine) { retrySpaceAllocated += numElementsPerLine; DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); + "retrySpaceAllocated = %d.\n", __func__, + numElementsPerLine, retrySpaceAllocated); peerCoalesceEngine->recvPushRetry(); } } From 6a3fea5b84c2be9851da9978eb194d7a0ea4d3c3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 14:12:33 -0700 Subject: [PATCH 115/279] Adding a new debug print. --- src/accl/graph/sega/coalesce_engine.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dde6e46aa9..e7e528aaf5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -353,6 +353,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -369,6 +371,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of @@ -397,7 +401,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received read response for retry " "for addr %lu. It was found in the cache as busy.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); } } else { // We have read the address to send the wl and it is not in the @@ -408,6 +416,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -417,6 +427,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); } @@ -740,6 +752,8 @@ CoalesceEngine::processNextSendRetryEvent() if (hit_in_cache) { int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -754,6 +768,8 @@ CoalesceEngine::processNextSendRetryEvent() push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); From 6ee0b0fe019cb5322e441fecad139d45dac8da7d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 15:51:58 -0700 Subject: [PATCH 116/279] Updating debug flags. Adding one per comp. 
--- configs/accl/sega.py | 14 ++-- src/accl/graph/SConscript | 4 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/base_mem_engine.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 91 +++++++++++++------------- src/accl/graph/sega/push_engine.cc | 9 ++- src/accl/graph/sega/wl_engine.cc | 44 ++++++------- 7 files changed, 82 insertions(+), 87 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 15431088d2..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=32, + push_req_queue_size=4, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=64, - resp_queue_size=64) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=32, - num_tgts_per_mshr=4, + num_mshr_entry=1, + num_tgts_per_mshr=1, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - on_the_fly_update_map_size=16) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7ca60c30bd..f5f7e962af 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') -DebugFlag('MPU') -# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 4c90dfa9a6..45877a12ca 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,3 +32,4 @@ SimObject('BaseReduceEngine.py') Source('base_mem_engine.cc') 
Source('base_reduce_engine.cc') +DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index e05357950b..cb4c1d81bb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/base/base_mem_engine.hh" -#include "debug/MPU.hh" +#include "debug/BaseMemEngine.hh" namespace gem5 { @@ -102,7 +102,7 @@ BaseMemEngine::processNextMemReqEvent() PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " + DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); @@ -190,7 +190,7 @@ BaseMemEngine::requestMemRetry(int space) { panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); - DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); + DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e7e528aaf5..522feebace 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -150,7 +149,7 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); - DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * 
peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); @@ -167,7 +166,7 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string(), @@ -184,28 +183,28 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries - DPRINTF(MPU, "%s: Out of MSHR entries. " + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -213,13 +212,13 @@ CoalesceEngine::recvWLRead(Addr addr) } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. " + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); @@ -231,11 +230,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); if (memQueueFull()) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " "Rejecting request.\n", __func__); stats.readRejections++; return false; @@ -245,19 +244,19 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(MPU, "%s: Allocated cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); stats.readMisses++; stats.numVertexReads++; @@ -265,10 +264,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -276,7 +275,7 @@ CoalesceEngine::recvWLRead(Addr addr) } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; @@ -289,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; @@ -306,11 +305,11 @@ CoalesceEngine::processNextRespondEvent() std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); 
responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from responseQueue. " + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, responseQueue.size()); @@ -333,7 +332,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); if (pkt->isWrite()) { delete pkt; - DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; } @@ -440,7 +439,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; - DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false @@ -449,7 +448,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -462,13 +461,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - 
DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -477,7 +476,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } @@ -490,7 +489,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } @@ -517,7 +516,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -529,17 +528,17 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." 
+ DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -556,12 +555,12 @@ CoalesceEngine::processNextApplyEvent() int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -596,7 +595,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -621,7 +620,7 @@ CoalesceEngine::processNextEvictEvent() if ((cacheBlocks[block_index].busyMask) || (applyQueue.find(block_index))) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; @@ -630,7 +629,7 @@ CoalesceEngine::processNextEvictEvent() (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 
1 : 0); if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, @@ -639,12 +638,12 @@ CoalesceEngine::processNextEvictEvent() return; } else { if (cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " "size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); enqueueMemReq(write_pkt); @@ -653,7 +652,7 @@ CoalesceEngine::processNextEvictEvent() if (cacheBlocks[block_index].hasConflict) { assert(!MSHRMap[block_index].empty()); Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = @@ -661,7 +660,7 @@ CoalesceEngine::processNextEvictEvent() PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, read_pkt->getAddr(), read_pkt->getSize()); @@ -673,7 +672,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { @@ -683,7 +682,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", __func__, block_index); } } @@ -709,7 +708,7 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { - DPRINTF(MPU, "%s: Received a push retry.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e87f4d275e..f17619942b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" -#include "debug/MPU.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -91,14 +90,14 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; sendPacket(blockedPacket); if (!_blocked) { blockedPacket = nullptr; - DPRINTF(MPU, "%s: Sent the blockedPacket. " + DPRINTF(PushEngine, "%s: Sent the blockedPacket. 
" "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); @@ -273,7 +272,7 @@ PushEngine::processNextPushEvent() assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; - DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); @@ -287,7 +286,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); stats.numUpdates++; - DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27ba5c40c8..9d4fb9cbe9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" +#include "debug/WLEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -73,7 +73,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); + DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -129,45 +129,38 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(MPU, "%s: Looking at the front of the updateQueue. Addr: %lu, " + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", - __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvWLRead(update_addr)) { + if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); } - } else { - DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", __func__, - onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); @@ -185,7 +178,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); addrWorkListMap[addr] = wl; - DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", __func__, addr, wl.to_string()); @@ -202,7 +195,7 @@ WLEngine::processNextReduceEvent() Addr addr = it.first; assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " "addrWorkListMap[%lu] = %s.\n", __func__, addr, onTheFlyUpdateMap[addr], @@ -210,15 +203,14 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min addrWorkListMap[addr].tempProp = std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" - "onTheFlyUpdateMap.size: %lu.\n", - __func__, addr, onTheFlyUpdateMap.size()); + DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, addr); } addrWorkListMap.clear(); } @@ -231,8 +223,12 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } + if (curTick() == ) { + std + } + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" + DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From 5ebb51dcea223ca46280ad501ae7634df1a05f9f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 16:11:12 -0700 Subject: [PATCH 117/279] Removing accidentally commented out wrong code. --- src/accl/graph/sega/wl_engine.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9d4fb9cbe9..70a921c48a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -223,10 +223,6 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - if (curTick() == ) { - std - } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", From 4946d9a46ca3f217b9538dd4bd1d5ffb65096632 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 14:23:35 -0700 Subject: [PATCH 118/279] Adding in between counter for retry. 
--- src/accl/graph/sega/push_engine.cc | 59 +++++++++++++++++++++--------- src/accl/graph/sega/push_engine.hh | 5 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index f17619942b..0c2b3deb3f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -37,11 +37,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - numRetries(0), pushReqQueueSize(params.push_req_queue_size), + numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -118,16 +117,28 @@ PushEngine::deallocatePushSpace(int space) /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - if ((numRetries > 0) && - ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // int free_space = + // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // if (free_space > numElementsPerLine) { + // DPRINTF(PushEngine, "%s: Found %d free spaces. " + // "retrySpaceAllocated = %d.\n", __func__, free_space, + // retrySpaceAllocated); + // retrySpaceAllocated += numElementsPerLine; + // peerCoalesceEngine->recvPushRetry(); + // } + // } + + if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - if (free_space > numElementsPerLine) { - DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); - retrySpaceAllocated += numElementsPerLine; + assert(free_space <= numElementsPerLine); + retrySpaceAllocated += free_space; + spacesAllocatedBetweenRetries += free_space; + if (spacesAllocatedBetweenRetries >= numElementsPerLine) { + spacesAllocatedBetweenRetries %= numElementsPerLine; peerCoalesceEngine->recvPushRetry(); } } @@ -214,15 +225,26 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if (numRetries > 0) { + // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // DPRINTF(PushEngine, "%s: Found %d free spaces in " + // "the pushReqQueue.\n", __func__, free_space); + // if (free_space > numElementsPerLine) { + // retrySpaceAllocated += numElementsPerLine; + // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + // "retrySpaceAllocated = %d.\n", __func__, + // numElementsPerLine, retrySpaceAllocated); + // peerCoalesceEngine->recvPushRetry(); + // } + // } + if (numRetries > 0) { - int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - DPRINTF(PushEngine, "%s: Found %d free spaces in " - "the pushReqQueue.\n", __func__, free_space); - if (free_space > numElementsPerLine) { - retrySpaceAllocated += numElementsPerLine; - DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, - numElementsPerLine, retrySpaceAllocated); + retrySpaceAllocated++; + DPRINTF(PushEngine, "%s: Allocated one space for retry. 
" + "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); + spacesAllocatedBetweenRetries++; + if (spacesAllocatedBetweenRetries == numElementsPerLine) { + spacesAllocatedBetweenRetries = 0; peerCoalesceEngine->recvPushRetry(); } } @@ -331,6 +353,7 @@ PushEngine::allocatePushSpace() { assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + assert(numRetries == 0); return true; } else { numRetries++; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9025ae9946..cd79139bbc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -99,15 +99,16 @@ class PushEngine : public BaseMemEngine }; int numElementsPerLine; - int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; - int numRetries; int pushReqQueueSize; + int numRetries; + int retrySpaceAllocated; + int spacesAllocatedBetweenRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 70a921c48a..79bf046ba3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,7 +136,7 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From 42ca800f3de786471d37235878f60f84d91ffe1e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 23:24:32 -0700 Subject: [PATCH 119/279] Fixing the retry mechanism. 
--- src/accl/graph/sega/coalesce_engine.cc | 21 ++++-- src/accl/graph/sega/push_engine.cc | 89 +++++++++----------------- src/accl/graph/sega/push_engine.hh | 9 ++- 3 files changed, 55 insertions(+), 64 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 522feebace..b3167a0e95 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -354,6 +354,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // It is not busy anymore, we have to send the wl from cache. DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -374,6 +375,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -402,7 +404,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + assert(peerPushEngine->getNumRetries() == needsPush.count()); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); } @@ -417,6 +421,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // No applying of the line needed. 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -430,6 +435,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); } delete pkt; @@ -708,6 +714,13 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { + if (needsPush.count() == 0) { + DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + "bit in needsPush. Rejecting the retry.\n", __func__); + peerPushEngine->recvRetryReject(); + return; + } + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; @@ -715,7 +728,8 @@ CoalesceEngine::processNextSendRetryEvent() uint32_t slice = 0; bool hit_in_cache = false; - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -734,9 +748,6 @@ CoalesceEngine::processNextSendRetryEvent() break; } } - if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { - it = 0; - } } assert(it < MAX_BITVECTOR_SIZE); @@ -753,6 +764,7 @@ CoalesceEngine::processNextSendRetryEvent() int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -770,6 +782,7 @@ CoalesceEngine::processNextSendRetryEvent() DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, 
needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0c2b3deb3f..6db91734fe 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), + numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -106,39 +106,22 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::deallocatePushSpace(int space) { - retrySpaceAllocated -= space; - DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " - "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " - "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", - __func__, space, numRetries, - nextAddrGenEvent.scheduled() ? "true" : "false", - pendingMemRetry() ? "true" : "false", - pushReqQueue.size(), retrySpaceAllocated); /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); - // int free_space = - // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // if (free_space > numElementsPerLine) { - // DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - // "retrySpaceAllocated = %d.\n", __func__, free_space, - // retrySpaceAllocated); - // retrySpaceAllocated += numElementsPerLine; - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - int free_space = - pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - assert(free_space <= numElementsPerLine); - retrySpaceAllocated += free_space; - spacesAllocatedBetweenRetries += free_space; - if (spacesAllocatedBetweenRetries >= numElementsPerLine) { - spacesAllocatedBetweenRetries %= numElementsPerLine; + DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", + __func__, space); + numPendingRetries--; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -162,6 +145,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { @@ -187,8 +172,10 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - numRetries--; - retrySpaceAllocated--; + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); + + numTotalRetries--; if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { if (!pendingMemRetry()) { @@ -225,26 +212,16 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if (numRetries > 0) { - // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // DPRINTF(PushEngine, "%s: Found %d free spaces in " - // "the pushReqQueue.\n", __func__, free_space); - // if (free_space > numElementsPerLine) { - // retrySpaceAllocated += numElementsPerLine; - // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - // "retrySpaceAllocated = %d.\n", __func__, - // numElementsPerLine, retrySpaceAllocated); - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - retrySpaceAllocated++; - DPRINTF(PushEngine, "%s: Allocated one space for retry. " - "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); - spacesAllocatedBetweenRetries++; - if (spacesAllocatedBetweenRetries == numElementsPerLine) { - spacesAllocatedBetweenRetries = 0; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -350,13 +327,11 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { - assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - assert(numRetries == 0); + ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { return true; } else { - numRetries++; + numTotalRetries++; return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cd79139bbc..a3a308554f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ 
b/src/accl/graph/sega/push_engine.hh @@ -106,9 +106,8 @@ class PushEngine : public BaseMemEngine Addr baseEdgeAddr; int pushReqQueueSize; - int numRetries; - int retrySpaceAllocated; - int spacesAllocatedBetweenRetries; + int numTotalRetries; + int numPendingRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps @@ -164,6 +163,10 @@ class PushEngine : public BaseMemEngine void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); + + int getNumRetries() { return numTotalRetries; } + + void recvRetryReject() { numPendingRetries--; } }; } From 17836488931114f979c90440b0ca0de0b6028d2b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 15:59:31 -0700 Subject: [PATCH 120/279] Limiting retries to one. --- src/accl/graph/sega/push_engine.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6db91734fe..ab2962b253 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -118,7 +118,7 @@ PushEngine::deallocatePushSpace(int space) DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; @@ -218,7 +218,7 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; From 11777e3f97e343afff019bb40dd47e428952e47f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 17:28:51 -0700 Subject: [PATCH 121/279] Adding MemoryEvent 
class and nextReadOnMissEvent. --- src/accl/graph/sega/coalesce_engine.cc | 42 +++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 21 +++++++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b3167a0e95..033c1f3363 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), + nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -175,7 +176,6 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -233,9 +233,9 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memQueueFull()) { - DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " - "Rejecting request.\n", __func__); + if (lineFillBuffer.size() == numMSHREntry) { + DPRINTF(CoalesceEngine, "%s: No space left in " + "lineFillBuffer. Rejecting request.\n", __func__); stats.readRejections++; return false; } @@ -255,9 +255,15 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); - enqueueMemReq(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", - __func__); + // enqueueMemReq(pkt); + lineFillBuffer.push_back(pkt); + DPRINTF(CoalesceEngine, "%s: Pushed pkt to " + "lineFillBuffer. lineFillBuffer.size = %d.\n", + __func__, lineFillBuffer.size()); + if ((!nextReadOnMissEvent.pending()) && + (!nextReadOnMissEvent.scheduled())) { + schedule(nextReadOnMissEvent, nextCycle()); + } stats.readMisses++; stats.numVertexReads++; return true; @@ -296,6 +302,28 @@ CoalesceEngine::recvWLRead(Addr addr) } } +void +CoalesceEngine::processNextReadOnMissEvent() +{ + if (memQueueFull()) { + nextReadOnMissEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } + + PacketPtr pkt = lineFillBuffer.front(); + enqueueMemReq(pkt); + + lineFillBuffer.pop_front(); + + if (!lineFillBuffer.empty()) { + assert(!nextReadOnMissEvent.scheduled()); + assert(!nextReadOnMissEvent.pending()); + schedule(nextReadOnMissEvent, nextCycle()); + } +} + // TODO: For loop to empty the entire responseQueue. 
void CoalesceEngine::processNextRespondEvent() diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e1033a4622..05fa555ec8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,6 +49,20 @@ class WLEngine; class CoalesceEngine : public BaseMemEngine { private: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + struct Block { WorkListItem* items; @@ -93,7 +107,7 @@ class CoalesceEngine : public BaseMemEngine int numMSHREntry; int numTgtsPerMSHR; std::unordered_map> MSHRMap; - + std::deque lineFillBuffer; std::deque> responseQueue; int currentBitSliceIndex; @@ -107,13 +121,16 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + MemoryEvent nextReadOnMissEvent; + void processNextReadOnMissEvent(); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - EventFunctionWrapper nextEvictEvent; + MemoryEvent nextEvictEvent; void processNextEvictEvent(); EventFunctionWrapper nextSendRetryEvent; From f47a3f186ccc80fd077d30dd58e31e4a1e7abc9e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 22:04:08 -0700 Subject: [PATCH 122/279] Restructuring events and adding nextWriteBackEvent. 
--- src/accl/graph/base/data_structs.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 290 ++++++++++++------------- src/accl/graph/sega/coalesce_engine.hh | 21 +- src/accl/graph/sega/push_engine.cc | 4 +- 4 files changed, 153 insertions(+), 166 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f938be72f1..f178d5a7e2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,13 +90,13 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class FIFOSet +class InOutSet { private: std::unordered_set set; public: - FIFOSet(int cap) + InOutSet(int cap) { set.reserve(cap); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 033c1f3363..ddbd22a8b5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,16 +42,17 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntry(params.num_mshr_entry), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), currentBitSliceIndex(0), numRetriesReceived(0), applyQueue(numLines), - evictQueue(numLines), - nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), + writeBackQueue(numLines), + replaceQueue(numLines), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { @@ -149,7 +150,7 @@ 
CoalesceEngine::getBlockAddrFromBitIndex(int index) bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHRMap.size() <= numMSHREntry); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; @@ -184,11 +185,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); - assert(MSHRMap.size() <= numMSHREntry); - if (MSHRMap.size() == numMSHREntry) { + assert(MSHR.size() <= numMSHREntries); + if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); @@ -199,24 +200,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if (!cacheBlocks[block_index].busyMask) { + + if ((cacheBlocks[block_index].busyMask == 0) && + (cacheBlocks[block_index].valid)) { applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, @@ -230,39 +233,31 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " - "allocate a cache line for it.\n", - __func__, addr); - if (lineFillBuffer.size() == numMSHREntry) { - DPRINTF(CoalesceEngine, "%s: No space left in " - "lineFillBuffer. Rejecting request.\n", __func__); - stats.readRejections++; - return false; - } + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. 
" + "Allocating a cache line for it.\n" + , __func__, addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " - "Addr: %lu.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); - MSHRMap[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, addr, aligned_addr, peerMemoryAtomSize); // enqueueMemReq(pkt); - lineFillBuffer.push_back(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to " - "lineFillBuffer. lineFillBuffer.size = %d.\n", - __func__, lineFillBuffer.size()); - if ((!nextReadOnMissEvent.pending()) && - (!nextReadOnMissEvent.scheduled())) { - schedule(nextReadOnMissEvent, nextCycle()); + fillQueue.push_back(block_index); + // FIXME: Fix this DPRINTF + // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " + // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", + // __func__, fillQueue.size()); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -270,10 +265,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -293,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readHitUnderMisses++; } - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -303,24 +298,29 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextReadOnMissEvent() +CoalesceEngine::processNextMemoryReadEvent() { if (memQueueFull()) { - nextReadOnMissEvent.sleep(); + nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. return; } - PacketPtr pkt = lineFillBuffer.front(); + int block_index = fillQueue.front(); + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + enqueueMemReq(pkt); - lineFillBuffer.pop_front(); + fillQueue.pop_front(); - if (!lineFillBuffer.empty()) { - assert(!nextReadOnMissEvent.scheduled()); - assert(!nextReadOnMissEvent.pending()); - schedule(nextReadOnMissEvent, nextCycle()); + if (!fillQueue.empty()) { + assert(!nextMemoryReadEvent.scheduled()); + assert(!nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -347,11 +347,13 @@ CoalesceEngine::processNextRespondEvent() } } +// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!nextEvictEvent.scheduled()); - schedule(nextEvictEvent, nextCycle()); + // assert(!nextEvictEvent.scheduled()); + // schedule(nextEvictEvent, nextCycle()); + return; } bool @@ -413,10 +415,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -477,7 +479,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -490,18 +492,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; - for (int i = 0; i < MSHRMap[block_index].size(); i++) { - Addr miss_addr = MSHRMap[block_index][i]; + for (int 
i = 0; i < MSHR[block_index].size(); i++) { + Addr miss_addr = MSHR[block_index][i]; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -510,25 +512,25 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this - // TODO: Change the MSHRMap from map to map + // TODO: Change the MSHR from map to map int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHRMap[block_index][i - bias]; - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + Addr print_addr = MSHR[block_index][i - bias]; + MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } - if (MSHRMap[block_index].empty()) { - MSHRMap.erase(block_index); + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { assert(cacheBlocks[block_index].hasConflict); @@ -562,13 +564,13 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." + DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); @@ -588,13 +590,13 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].busyMask) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " + if (cacheBlocks[block_index].busyMask != 0) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -628,17 +630,17 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); + writeBackQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
writeBackQueue.size = %u.\n", + __func__, block_index, writeBackQueue.size()); } applyQueue.pop_front(); - if ((!evictQueue.empty()) && - (!pendingMemRetry()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } if ((!applyQueue.empty()) && @@ -648,85 +650,64 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextEvictEvent() +CoalesceEngine::processNextWriteBackEvent() { - int block_index = evictQueue.front(); + if (memQueueFull()) { + nextWriteBackEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } - if ((cacheBlocks[block_index].busyMask) || + int block_index = writeBackQueue.front(); + + // Why would we write it back if it does not have a conflict? + assert(cacheBlocks[block_index].hasConflict); + + if ((cacheBlocks[block_index].busyMask != 0) || (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "writeback process. Therefore, ignoring the apply schedule.\n", __func__, block_index); + // FIXME: Fix the name of this stat. stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].dirty ? - (cacheBlocks[block_index].hasConflict ? 2 : 1) : - (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. 
dirty: %d, " - "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].dirty, - cacheBlocks[block_index].hasConflict); - requestMemRetry(space_needed); - return; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", - __func__, block_index); - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " - "size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); - } - - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" - " is Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, - read_pkt->getAddr(), read_pkt->getSize()); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", - __func__, block_index, aligned_miss_addr); - } else { - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", - __func__, block_index); - } + if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: Change observed on " + "cacheBlocks[%d].\n", __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } + assert(!MSHR[block_index].empty()); + Addr miss_addr = MSHR[block_index].front(); + DPRINTF(CoalesceEngine, "%s: First conflicting address for " + "cacheBlocks[%d] is Addr: %lu.\n", + __func__, block_index, miss_addr); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].dirty = false; + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " + "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); + fillQueue.push_back(block_index); } - evictQueue.pop_front(); + writeBackQueue.pop_front(); - if ((!evictQueue.empty()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if (!writeBackQueue.empty()) { + assert(!nextWriteBackEvent.pending()); + assert(!nextWriteBackEvent.scheduled()); + schedule(nextWriteBackEvent, nextCycle()); } } @@ -817,10 +798,11 @@ CoalesceEngine::processNextSendRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - 
evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -829,6 +811,8 @@ CoalesceEngine::processNextSendRetryEvent() // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite // queueing in the outstandingMemReqQueue. + // FIXME: Also do not send requests for cache lines that are already + // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05fa555ec8..563fa671b3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -104,25 +104,28 @@ class CoalesceEngine : public BaseMemEngine int numLines; int numElementsPerLine; - int numMSHREntry; + int numMSHREntries; int numTgtsPerMSHR; - std::unordered_map> MSHRMap; - std::deque lineFillBuffer; + std::unordered_map> MSHR; + + std::deque fillQueue; + std::deque> responseQueue; int currentBitSliceIndex; int numRetriesReceived; - FIFOSet applyQueue; + InOutSet applyQueue; std::bitset needsPush; - FIFOSet evictQueue; + InOutSet writeBackQueue; + InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - MemoryEvent nextReadOnMissEvent; - void processNextReadOnMissEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -130,8 +133,8 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper 
nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextEvictEvent; - void processNextEvictEvent(); + MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(); EventFunctionWrapper nextSendRetryEvent; void processNextSendRetryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ab2962b253..5ab8db401c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -117,7 +117,7 @@ PushEngine::deallocatePushSpace(int space) (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); From 66f146de50a0b25c653bc8124f6ac71d0571b5cb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 23:57:58 -0700 Subject: [PATCH 123/279] Implemented MemoryEvent retry mechanism. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 87 ++++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 17 +++-- src/accl/graph/sega/push_engine.hh | 3 + 5 files changed, 88 insertions(+), 26 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..ffd74241e7 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + outstanding_mem_req_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ddbd22a8b5..4a0600e9c0 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -300,10 +300,16 @@ CoalesceEngine::recvWLRead(Addr addr) void CoalesceEngine::processNextMemoryReadEvent() { + assert(!nextMemoryReadEvent.pending()); if (memQueueFull()) { - nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ nextMemoryReadEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextMemoryReadEvent"); return; } @@ -351,8 +357,33 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { - // assert(!nextEvictEvent.scheduled()); - // schedule(nextEvictEvent, nextCycle()); + assert(!pendingEventQueue.empty()); + std::string front = pendingEventQueue.front(); + + if (front == "nextMemoryReadEvent") { + assert(!nextMemoryReadEvent.scheduled()); + assert(nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); + nextMemoryReadEvent.wake(); + } else if (front == "nextWriteBackEvent") { + assert(!nextWriteBackEvent.scheduled()); + assert(nextWriteBackEvent.pending()); + schedule(nextWriteBackEvent, nextCycle()); + nextWriteBackEvent.wake(); + } else if (front == "nextSendRetryEvent") { + assert(!nextSendRetryEvent.scheduled()); + assert(nextSendRetryEvent.pending()); + breakPointFunction(); + schedule(nextSendRetryEvent, nextCycle()); + nextSendRetryEvent.wake(); + } else { + panic("EVENT IS NOT RECOGNIZED.\n"); + } + + pendingEventQueue.pop_front(); + if (!pendingEventQueue.empty()) { + requestMemRetry(1); + } return; } @@ -652,10 +683,16 @@ CoalesceEngine::processNextApplyEvent() void CoalesceEngine::processNextWriteBackEvent() { + assert(!nextWriteBackEvent.pending()); if (memQueueFull()) { nextWriteBackEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextWriteBackEvent"); return; } @@ -715,20 +752,25 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. + assert(!nextSendRetryEvent.pending()); + assert(!nextSendRetryEvent.scheduled()); + assert(numRetriesReceived == 1); + schedule(nextSendRetryEvent, nextCycle()); } void CoalesceEngine::processNextSendRetryEvent() { - if (needsPush.count() == 0) { - DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - "bit in needsPush. Rejecting the retry.\n", __func__); - peerPushEngine->recvRetryReject(); - return; - } + assert(!nextSendRetryEvent.pending()); + assert(needsPush.count() != 0); + // if (needsPush.count() == 0) { + // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + // "bit in needsPush. Rejecting the retry.\n", __func__); + // peerPushEngine->recvRetryReject(); + // return; + // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -807,6 +849,16 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { + if (memQueueFull()) { + nextSendRetryEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextSendRetryEvent"); + return; + } + // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. 
For now we're enabling infinite @@ -816,17 +868,12 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemQueueSpace(1)) { - enqueueMemReq(pkt); - } else { - requestMemRetry(1); - } + enqueueMemReq(pkt); } numRetriesReceived--; - if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { - schedule(nextSendRetryEvent, nextCycle()); - } + assert(numRetriesReceived == 0); + assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 563fa671b3..83ca6e5f14 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -124,6 +124,8 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::deque pendingEventQueue; + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); @@ -136,7 +138,7 @@ class CoalesceEngine : public BaseMemEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - EventFunctionWrapper nextSendRetryEvent; + MemoryEvent nextSendRetryEvent; void processNextSendRetryEvent(); struct CoalesceStats : public statistics::Group @@ -159,6 +161,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; + void breakPointFunction() { std::cout << "Salaam." 
<< std::endl; } protected: virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5ab8db401c..c64ff003c4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,6 +43,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -121,8 +122,8 @@ PushEngine::deallocatePushSpace(int space) (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -221,8 +222,8 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -239,6 +240,14 @@ PushEngine::processNextAddrGenEvent() } } +void +PushEngine::processNextSendRetryEvent() +{ + assert(numPendingRetries == 0); + numPendingRetries++; + peerCoalesceEngine->recvPushRetry(); +} + void PushEngine::recvMemRetry() { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a3a308554f..378cd1a487 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -128,6 +128,9 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct PushStats : public statistics::Group { 
PushStats(PushEngine &push); From 26f18a493667f5b83e35d136abf09cc915ed80d6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:43:22 -0700 Subject: [PATCH 124/279] Adding DPRINTF for structure sizes. --- src/accl/graph/SConscript | 2 +- src/accl/graph/base/base_mem_engine.cc | 47 +++++--- src/accl/graph/base/base_mem_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 151 +++++++++++++++---------- src/accl/graph/sega/wl_engine.hh | 8 +- 7 files changed, 134 insertions(+), 85 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index f5f7e962af..7fd3591b2c 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') - +DebugFlag('SEGAStructureSize') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index cb4c1d81bb..aa78aac8b5 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/BaseMemEngine.hh" +#include "debug/SEGAStructureSize.hh" namespace gem5 { @@ -37,7 +38,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), respQueueSize(params.resp_queue_size), memRetryRequested(false), @@ -99,17 +100,22 @@ BaseMemEngine::processNextMemReqEvent() { if ((respQueueSize == 0) || ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = outstandingMemReqQueue.front(); + PacketPtr pkt = memQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following 
info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - + memQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); if (memRetryRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { + (memQueue.size() <= + (memQueueSize - memSpaceRequested))) { memRetryRequested = false; memSpaceRequested = 0; recvMemRetry(); @@ -117,7 +123,7 @@ BaseMemEngine::processNextMemReqEvent() } if ((!memPort.blocked()) && - (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -156,30 +162,35 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemQueueSpace(int space) { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) + (memQueueSize == 0) || + (memQueue.size() <= (memQueueSize - space)) ); } bool BaseMemEngine::memQueueFull() { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize != 0) && - (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); + (memQueueSize != 0) && + (memQueue.size() == memQueueSize)); } void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { 
panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - outstandingMemReqQueue.push_back(pkt); - + memQueue.push_back(pkt); + DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } @@ -199,7 +210,7 @@ void BaseMemEngine::wakeUp() { assert(!nextMemReqEvent.scheduled()); - if (!outstandingMemReqQueue.empty()) { + if (!memQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 64ef49ee1d..520970c5a0 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -68,12 +68,12 @@ class BaseMemEngine : public ClockedObject System* system; MemPort memPort; - int outstandingMemReqQueueSize; + int memQueueSize; int onTheFlyReqs; int respQueueSize; bool memRetryRequested; int memSpaceRequested; - std::deque outstandingMemReqQueue; + std::deque memQueue; EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4a0600e9c0..ea572ea749 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -862,7 +862,7 @@ CoalesceEngine::processNextSendRetryEvent() // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite - // queueing in the outstandingMemReqQueue. + // queueing in the memQueue. 
// FIXME: Also do not send requests for cache lines that are already // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c64ff003c4..d745dabef6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -222,8 +222,9 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } } } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 79bf046ba3..2d4ffc9cac 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" +#include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" @@ -39,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + registerFileSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -129,45 +130,68 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. Addr: %lu, " - "value: %u.\n", __func__, update_addr, update_value); + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", - __func__, update_addr); - if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + // TODO: It might be a good idea for WLEngine to act differently + // on cache rejects. As a first step the cache should not just + // return a boolean value. It should return an integer/enum + // to tell WLEngine why it rejected the read request. Their might + // be things that WLEngine can do to fix head of the line blocking. if (coalesceEngine->recvWLRead(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, - update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = update_value; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, - update_addr, onTheFlyUpdateMap[update_addr]); - onTheFlyUpdateMap[update_addr] = - std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " - "onTheFlyUpdateMap. onTheFlyUpdateMap[%lu] = %u.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - stats.onTheFlyCoalesce++; + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, registerFile[update_addr]); + registerFile[update_addr] = + std::min(update_value, registerFile[update_addr]); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, registerFile[update_addr]); + stats.registerFileCoalesce++; updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } - // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { + if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { schedule(nextReadEvent, nextCycle()); } } @@ -175,14 +199,16 @@ WLEngine::processNextReadEvent() void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + assert(workListFile.size() <= registerFileSize); - addrWorkListMap[addr] = wl; - DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" - " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", - __func__, addr, wl.to_string()); - - assert(!addrWorkListMap.empty()); + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -191,28 +217,31 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - for (auto &it : addrWorkListMap) { + for (auto &it : workListFile) { Addr addr = it.first; - assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); - uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " - "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " - "addrWorkListMap[%lu] = %s.\n", __func__, - addr, onTheFlyUpdateMap[addr], - addr, addrWorkListMap[addr].to_string()); + assert(registerFile.find(addr) != registerFile.end()); + uint32_t update_value = registerFile[addr]; + DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + __func__, addr, registerFile[addr], + addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - addrWorkListMap[addr].tempProp = - std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", - __func__, addr, addrWorkListMap[addr].to_string()); + workListFile[addr].tempProp = + std::min(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", + __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - onTheFlyUpdateMap.erase(addr); - DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + coalesceEngine->recvWLWrite(addr, workListFile[addr]); + registerFile.erase(addr); + DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); } - addrWorkListMap.clear(); + workListFile.clear(); } bool @@ -224,11 +253,19 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + + + // delete the packet since it's not needed anymore. delete pkt; - assert(!updateQueue.empty()); + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } @@ -241,7 +278,7 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), - ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), "Number of memory blocks read for vertecies") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 27fc3efa7a..79fe60f6d0 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -74,10 +74,10 @@ class WLEngine : public BaseReduceEngine int updateQueueSize; std::deque> updateQueue; - int onTheFlyUpdateMapSize; - std::unordered_map onTheFlyUpdateMap; + int registerFileSize; + std::unordered_map registerFile; - std::unordered_map addrWorkListMap; + std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); @@ -98,7 +98,7 @@ class WLEngine : public BaseReduceEngine WLEngine &wl; statistics::Scalar numReduce; - statistics::Scalar onTheFlyCoalesce; + statistics::Scalar registerFileCoalesce; }; WorkListStats stats; From 2cf0bbdf8c32ff5f62cbd2e852f56b5175407b90 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:44:06 -0700 Subject: [PATCH 125/279] Updating config script for sega. 
--- configs/accl/sega.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ffd74241e7..cf189733f0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=16, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=4, + resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1) + num_mshr_entry=8, + num_tgts_per_mshr=8, + outstanding_mem_req_queue_size=8) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=16, + on_the_fly_update_map_size=8) def getRespPort(self): return self.wl_engine.resp_port From 252cb705085471d4f267e8d8a40d32666cce3333 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:14:08 -0700 Subject: [PATCH 126/279] Adding more assertion for MSHR and fillQueue. 
--- configs/accl/sega.py | 12 ++++++------ src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/push_engine.cc | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index cf189733f0..8fb3b75996 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -16,13 +16,13 @@ def __init__(self, base_edge_addr): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="1MiB", - num_mshr_entry=8, - num_tgts_per_mshr=8, - outstanding_mem_req_queue_size=8) + cache_size="128B", + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=4) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ea572ea749..8f56962a8c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -232,6 +232,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { assert(!cacheBlocks[block_index].valid); + assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" @@ -251,6 +252,7 @@ CoalesceEngine::recvWLRead(Addr addr) // enqueueMemReq(pkt); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", @@ -737,6 +739,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); } writeBackQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d745dabef6..a41ca8a778 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() int free_space = pushReqQueueSize - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - "free spaces.\n", __func__, free_space); + " free spaces.\n", __func__, free_space); if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " From f93dbc1e083e9dfa8d472d2d9e5d703f73719886 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:27:10 -0700 Subject: [PATCH 127/279] Adding debug flags for responseQueue size. --- src/accl/graph/sega/coalesce_engine.cc | 41 +++++++++++++++++++------- src/accl/graph/sega/wl_engine.hh | 2 ++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f56962a8c..959bfa9743 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,6 +32,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -168,11 +169,18 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; @@ -345,9 +353,12 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); if ((!nextRespondEvent.scheduled()) && (!responseQueue.empty())) { @@ -536,10 +547,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " - "responseQueue. 
responseQueue.size = %u.\n" - , __func__, block_index, wl_offset, - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 79fe60f6d0..5e8e5b25f3 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -114,6 +114,8 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + + int getRegisterFileSize() { return registerFileSize; } }; } From 744e4f3bab113366cb63be4d067ca2bd9876c81f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:33:53 -0700 Subject: [PATCH 128/279] Adding assertions to test the size of queues in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 10 +++++++++- src/accl/graph/sega/coalesce_engine.hh | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 959bfa9743..753bfc988b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), writeBackQueue(numLines), - replaceQueue(numLines), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), @@ -320,6 +319,8 @@ CoalesceEngine::processNextMemoryReadEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextMemoryReadEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } @@ -460,6 +461,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { schedule(nextWriteBackEvent, nextCycle()); @@ -683,6 +685,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", __func__, block_index, writeBackQueue.size()); } @@ -714,6 +717,8 @@ CoalesceEngine::processNextWriteBackEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextWriteBackEvent"); + // Maximum three MemoryEvent. 
+ assert(pendingEventQueue.size() <= 3); return; } @@ -863,6 +868,7 @@ CoalesceEngine::processNextSendRetryEvent() } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!writeBackQueue.empty()) && (!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { @@ -878,6 +884,8 @@ CoalesceEngine::processNextSendRetryEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextSendRetryEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 83ca6e5f14..cfa0a79102 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,7 +118,6 @@ class CoalesceEngine : public BaseMemEngine std::bitset needsPush; InOutSet writeBackQueue; - InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); From 9fb9943f77960ecdde9e710a9c62f5866ce4687a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 22:43:28 -0700 Subject: [PATCH 129/279] Checking the size of queues in PushEngine and WLEngine --- src/accl/graph/base/base_mem_engine.cc | 2 +- src/accl/graph/base/base_mem_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index aa78aac8b5..590307b2bc 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,10 +40,10 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): memPort(name() + ".mem_port", this), memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), - respQueueSize(params.resp_queue_size), memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + respQueueSize(params.resp_queue_size), 
_requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) {} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 520970c5a0..01c862d555 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -70,7 +70,6 @@ class BaseMemEngine : public ClockedObject int memQueueSize; int onTheFlyReqs; - int respQueueSize; bool memRetryRequested; int memSpaceRequested; std::deque memQueue; @@ -79,6 +78,8 @@ class BaseMemEngine : public ClockedObject void processNextMemReqEvent(); protected: + + int respQueueSize; const RequestorID _requestorId; size_t peerMemoryAtomSize; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a41ca8a778..cfebf8e5df 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -173,6 +173,7 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -263,6 +264,7 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); + assert(memRespQueue.size() <= respQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); From 2787e47256a1dcfa440cbd3bff0bf7f3ca50c7dd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 09:33:11 -0700 Subject: [PATCH 130/279] Making CoalesceEngine a BaseMemoryEngine. 
--- configs/accl/sega.py | 13 ++- src/accl/graph/sega/BaseMemoryEngine.py | 42 ++++++++ src/accl/graph/sega/CoalesceEngine.py | 17 ++- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/base_memory_engine.cc | 122 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.hh | 99 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 70 ++++--------- src/accl/graph/sega/coalesce_engine.hh | 14 +-- 8 files changed, 305 insertions(+), 75 deletions(-) create mode 100644 src/accl/graph/sega/BaseMemoryEngine.py create mode 100644 src/accl/graph/sega/base_memory_engine.cc create mode 100644 src/accl/graph/sega/base_memory_engine.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8fb3b75996..7577331f2b 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,19 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=2, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=4, - resp_queue_size=8) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="128B", + cache_size="32B", num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + num_tgts_per_mshr=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, - on_the_fly_update_map_size=4) + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 7667a22c5a..536c3477ae 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,21 +27,16 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class CoalesceEngine(BaseMemEngine): +class CoalesceEngine(BaseMemoryEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "") - - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") - - num_mshr_entry = Param.Int(4, "") - num_tgts_per_mshr = Param.Int(20, "") - - # Don't change. If changed. It will break functionality of coalesce. 
- resp_queue_size = 0 + peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") + cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + num_mshr_entry = Param.Int(4, "Number of MSHR entries.") + num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 77e508f4ed..97a62d44a0 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,16 +27,19 @@ Import('*') +SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') +DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc new file mode 100644 index 0000000000..e5e78f2c04 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } else { + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + 
sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..8fb8fde7e6 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ + +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseMemoryEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseMemoryEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseMemoryEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseMemoryEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + protected: + System* system; + const RequestorID _requestorId; + + MemPort memPort; + + size_t peerMemoryAtomSize; + + virtual void recvMemRetry() = 0; + virtual bool handleMemResp(PacketPtr pkt) = 0; + + PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + + 
public: + PARAMS(BaseMemoryEngine); + + BaseMemoryEngine(const Params ¶ms); + ~BaseMemoryEngine(); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + +}; + +} + +#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 753bfc988b..678cf0456e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,8 +38,8 @@ namespace gem5 { -CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseMemEngine(params), +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -67,12 +67,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): needsPush.reset(); } -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - sendMemFunctional(pkt); -} - void CoalesceEngine::startup() { @@ -171,13 +165,13 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
@@ -257,7 +251,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - // enqueueMemReq(pkt); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF @@ -310,16 +303,12 @@ void CoalesceEngine::processNextMemoryReadEvent() { assert(!nextMemoryReadEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. nextMemoryReadEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvent. + // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); return; } @@ -330,7 +319,7 @@ CoalesceEngine::processNextMemoryReadEvent() DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); fillQueue.pop_front(); @@ -367,11 +356,13 @@ CoalesceEngine::processNextRespondEvent() } } -// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!pendingEventQueue.empty()); + if (pendingEventQueue.empty()) { + return; + } + std::string front = pendingEventQueue.front(); if (front == "nextMemoryReadEvent") { @@ -387,7 +378,6 @@ CoalesceEngine::recvMemRetry() } else if (front == "nextSendRetryEvent") { assert(!nextSendRetryEvent.scheduled()); assert(nextSendRetryEvent.pending()); - breakPointFunction(); schedule(nextSendRetryEvent, nextCycle()); nextSendRetryEvent.wake(); } else { @@ -395,12 +385,10 @@ CoalesceEngine::recvMemRetry() } pendingEventQueue.pop_front(); - if (!pendingEventQueue.empty()) { - requestMemRetry(1); - } return; } +// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -552,13 +540,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. @@ -708,14 +696,8 @@ void CoalesceEngine::processNextWriteBackEvent() { assert(!nextWriteBackEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { nextWriteBackEvent.sleep(); - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -744,7 +726,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); + memPort.sendPacket(write_pkt); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -764,6 +746,10 @@ CoalesceEngine::processNextWriteBackEvent() "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())){ + schedule(nextMemoryReadEvent, nextCycle()); + } } writeBackQueue.pop_front(); @@ -792,12 +778,6 @@ CoalesceEngine::processNextSendRetryEvent() { assert(!nextSendRetryEvent.pending()); assert(needsPush.count() != 0); - // if (needsPush.count() == 0) { - // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - // "bit in needsPush. Rejecting the retry.\n", __func__); - // peerPushEngine->recvRetryReject(); - // return; - // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -877,12 +857,8 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { - if (memQueueFull()) { + if (memPort.blocked()) { nextSendRetryEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextSendRetryEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -898,7 +874,7 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); } numRetriesReceived--; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index cfa0a79102..a322379b05 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,7 +31,7 @@ #include -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -39,14 +39,12 @@ #define MAX_BITVECTOR_SIZE (1 << 30) -// TODO: Add parameters for size, memory atom size, type size, -// length of items in the blocks. namespace gem5 { class WLEngine; -class CoalesceEngine : public BaseMemEngine +class CoalesceEngine : public BaseMemoryEngine { private: class MemoryEvent : public EventFunctionWrapper @@ -160,16 +158,14 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; - void breakPointFunction() { std::cout << "Salaam." << std::endl; } protected: - virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(CoalesceEngine); - CoalesceEngine(const CoalesceEngineParams ¶ms); + CoalesceEngine(const Params ¶ms); bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); @@ -178,9 +174,7 @@ class CoalesceEngine : public BaseMemEngine void recvPushRetry(); - void recvFunctional(PacketPtr pkt); - - virtual void startup(); + virtual void startup() override; }; } From d437ddfb839a540af17a5c8b66118892883293fb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 13:06:22 -0700 Subject: [PATCH 131/279] Fixing cache mapping issue. 
--- src/accl/graph/SConscript | 3 +- src/accl/graph/sega/base_memory_engine.cc | 14 +++ src/accl/graph/sega/base_memory_engine.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 105 ++++++++++++---------- src/accl/graph/sega/coalesce_engine.hh | 6 +- 5 files changed, 78 insertions(+), 52 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7fd3591b2c..53c6411de6 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,4 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', + 'BaseMemEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index e5e78f2c04..9db95d6bd6 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -55,6 +55,20 @@ BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) } } +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + // BaseMemoryEngine only supports one memory. + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " + "The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); +} + void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 8fb8fde7e6..efbfa5312d 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -69,6 +69,7 @@ class BaseMemoryEngine : public ClockedObject System* system; const RequestorID _requestorId; + AddrRange peerMemoryRange; MemPort memPort; size_t peerMemoryAtomSize; @@ -92,6 +93,7 @@ class BaseMemoryEngine : public ClockedObject void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void init() override; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 678cf0456e..21f048213a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -67,44 +67,48 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -void -CoalesceEngine::startup() -{ - AddrRangeList vertex_ranges = getAddrRanges(); - - bool found = false; - Addr first_match_addr = 0; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(first_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - first_match_addr += peerMemoryAtomSize; - } - - found = false; - Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(second_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - second_match_addr += peerMemoryAtomSize; - } - - nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - memoryAddressOffset = first_match_addr; -} +// void +// CoalesceEngine::startup() +// { +// return; + // std::cout << "Hello" << std::endl; + // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", + // __func__, peerMemoryRange.to_string()); + // AddrRangeList vertex_ranges = getAddrRanges(); + + 
// bool found = false; + // Addr first_match_addr = 0; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(first_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // first_match_addr += peerMemoryAtomSize; + // } + + // found = false; + // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(second_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // second_match_addr += peerMemoryAtomSize; + // } + + // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + // memoryAddressOffset = first_match_addr; +// } void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) @@ -117,7 +121,10 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - return ((int) (addr / peerMemoryAtomSize)) % numLines; + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", + __func__, addr, trimmed_addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } // addr should be aligned to peerMemoryAtomSize @@ -125,10 +132,10 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits; - return bit_index; + return atom_index * block_bits; } // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) @@ -136,9 +143,8 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - 
((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - return (block_addr + memoryAddressOffset); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); } bool @@ -149,7 +155,8 @@ CoalesceEngine::recvWLRead(Addr addr) __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); @@ -507,7 +514,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; + // int block_index = (addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(addr); DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -591,7 +599,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index a322379b05..28b204e198 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -91,8 +91,8 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - int nmpu; - Addr memoryAddressOffset; + // int nmpu; + // Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -174,7 +174,7 @@ class CoalesceEngine : public BaseMemoryEngine void recvPushRetry(); - virtual void startup() override; + // virtual void startup() override; }; } From 4f431ae04a41c8d94c0daf8b1364ceb492e1fb36 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 20:51:48 -0700 Subject: [PATCH 132/279] Refactoring PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/sega/CoalesceEngine.py | 6 +- src/accl/graph/sega/PushEngine.py | 15 ++- src/accl/graph/sega/WLEngine.py | 11 +- src/accl/graph/sega/base_memory_engine.hh | 20 ++- src/accl/graph/sega/coalesce_engine.hh | 14 --- src/accl/graph/sega/push_engine.cc | 143 +++++++++++----------- src/accl/graph/sega/push_engine.hh | 17 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 8 files changed, 117 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 536c3477ae..06c6f92750 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -36,7 +36,7 @@ class CoalesceEngine(BaseMemoryEngine): peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int(4, "Number of MSHR entries.") - num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index d3276799aa..447731219e 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,13 +27,20 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class PushEngine(BaseMemEngine): +class PushEngine(BaseMemoryEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("") - push_req_queue_size = Param.Int(0, "") + base_edge_addr = Param.Addr("The base address for the " + "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " + "queue 
push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index cab47fbe7b..98089328f4 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,11 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "") - update_queue_size = Param.Int(0, "") - on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary + coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " + "this WLEngine is connected to.") + update_queue_size = Param.Int("Size of the queue WLEngine stores " + "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " + "WLEngine has. 
It can service as " + "many updates as this queueu has " + "entries at the same time.") # 4 is arbitrary diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index efbfa5312d..5653ede698 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -43,7 +43,21 @@ namespace gem5 class BaseMemoryEngine : public ClockedObject { - private: + protected: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + class MemPort : public RequestPort { private: @@ -65,13 +79,11 @@ class BaseMemoryEngine : public ClockedObject virtual void recvReqRetry(); }; - protected: System* system; const RequestorID _requestorId; - AddrRange peerMemoryRange; MemPort memPort; - + AddrRange peerMemoryRange; size_t peerMemoryAtomSize; virtual void recvMemRetry() = 0; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 28b204e198..b8cac15f5c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,20 +47,6 @@ class WLEngine; class CoalesceEngine : public BaseMemoryEngine { private: - class MemoryEvent : public EventFunctionWrapper - { - private: - bool _pending; - public: - MemoryEvent(const std::function &callback, - const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) - {} - bool pending() { return _pending; } - void sleep() { _pending = true; } - void wake() { _pending = false; } - }; - struct Block { WorkListItem* items; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cfebf8e5df..d87462d7dd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ 
b/src/accl/graph/sega/push_engine.cc @@ -35,13 +35,15 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseMemEngine(params), +PushEngine::PushEngine(const Params ¶ms): + BaseMemoryEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), - nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + onTheFlyMemReqs(0), + memRespQueueSize(params.resp_queue_size), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) @@ -52,10 +54,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; - } else if (if_name == "mem_port") { - return BaseMemEngine::getPort(if_name, idx); } else { - return SimObject::getPort(if_name, idx); + return BaseMemoryEngine::getPort(if_name, idx); } } @@ -98,9 +98,9 @@ PushEngine::ReqPort::recvReqRetry() if (!_blocked) { blockedPacket = nullptr; DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); } } @@ -149,14 +149,9 @@ PushEngine::recvWLItem(WorkListItem wl) DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -178,67 +173,68 @@ PushEngine::recvWLItemRetry(WorkListItem wl) __func__, pushReqQueue.size()); numTotalRetries--; - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } void -PushEngine::processNextAddrGenEvent() +PushEngine::processNextMemoryReadEvent() { - Addr aligned_addr, offset; - int num_edges; - - PushPacketInfoGen &curr_info = pushReqQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " - "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); - - enqueueMemReq(pkt); - - if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; } - if (memQueueFull()) { - if (!pushReqQueue.empty()) { - requestMemRetry(1); + if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + Addr aligned_addr, offset; + int num_edges; + + PushPacketInfoGen &curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(PushEngine, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + reqOffsetMap[pkt->req] = offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = curr_info.value(); + + memPort.sendPacket(pkt); + onTheFlyMemReqs++; + + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + pushReqQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + " free spaces.\n", __func__, free_space); + if ((free_space >= numElementsPerLine) && + (numPendingRetries == 0)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } + } + } } - return; } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if (!pushReqQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -253,9 +249,11 @@ PushEngine::processNextSendRetryEvent() void PushEngine::recvMemRetry() { - assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); - schedule(nextAddrGenEvent, nextCycle()); + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } } bool @@ -264,7 +262,8 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. 
assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); - assert(memRespQueue.size() <= respQueueSize); + onTheFlyMemReqs--; + assert(memRespQueue.size() <= memRespQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 378cd1a487..9b182e2251 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -39,7 +39,7 @@ namespace gem5 class CoalesceEngine; -class PushEngine : public BaseMemEngine +class PushEngine : public BaseMemoryEngine { private: class PushPacketInfoGen { @@ -115,15 +115,14 @@ class PushEngine : public BaseMemEngine std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // Since the push engine can process incoming packets faster than - // memory can send those packets, the size of this queue will - // always be limited by the b/w of the memory. 
+ int onTheFlyMemReqs; + int memRespQueueSize; std::deque memRespQueue; template PacketPtr createUpdatePacket(Addr addr, T value); - EventFunctionWrapper nextAddrGenEvent; - void processNextAddrGenEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextPushEvent; void processNextPushEvent(); @@ -145,13 +144,12 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual int respBuffSize() { return memRespQueue.size(); } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); + PushEngine(const Params ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -169,7 +167,6 @@ class PushEngine : public BaseMemEngine int getNumRetries() { return numTotalRetries; } - void recvRetryReject() { numPendingRetries--; } }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2d4ffc9cac..12f4548aa2 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - registerFileSize(params.on_the_fly_update_map_size), + registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) From 4f1c302be299295e5dab8ef66a3663d924331fe0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:01:42 -0700 Subject: [PATCH 133/279] Refactored PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/SConscript | 4 +- src/accl/graph/base/BaseMemEngine.py | 47 --- src/accl/graph/base/SConscript | 3 - src/accl/graph/base/base_mem_engine.cc | 225 -------------- src/accl/graph/base/base_mem_engine.hh | 125 -------- src/accl/graph/sega/base_memory_engine.cc | 4 + src/accl/graph/sega/base_memory_engine.hh | 7 +- src/accl/graph/sega/coalesce_engine.cc | 362 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- 9 files changed, 275 insertions(+), 511 deletions(-) delete mode 100644 src/accl/graph/base/BaseMemEngine.py delete mode 100644 src/accl/graph/base/base_mem_engine.cc delete mode 100644 src/accl/graph/base/base_mem_engine.hh diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 53c6411de6..5dffd1a396 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', - 'BaseMemEngine', 'BaseMemoryEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py deleted file mode 100644 index 2ecb6659d8..0000000000 --- a/src/accl/graph/base/BaseMemEngine.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseMemEngine(ClockedObject): - abstract = True - type = 'BaseMemEngine' - cxx_header = "accl/graph/base/base_mem_engine.hh" - cxx_class = 'gem5::BaseMemEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") - - outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " - "which memory requests are queued.") - - attached_memory_atom_size = Param.Int(64, "The atom size of the attached " - "memory.") - - resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 45877a12ca..0e43d1aed8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,6 @@ Import('*') -SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_mem_engine.cc') Source('base_reduce_engine.cc') -DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc deleted file mode 100644 index 590307b2bc..0000000000 --- a/src/accl/graph/base/base_mem_engine.cc +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_mem_engine.hh" - -#include "debug/BaseMemEngine.hh" -#include "debug/SEGAStructureSize.hh" - -namespace gem5 -{ - -BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): - ClockedObject(params), - system(params.system), - memPort(name() + ".mem_port", this), - memQueueSize(params.outstanding_mem_req_queue_size), - onTheFlyReqs(0), - memRetryRequested(false), - memSpaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - respQueueSize(params.resp_queue_size), - _requestorId(system->getRequestorId(this)), - peerMemoryAtomSize(params.attached_memory_atom_size) -{} - -BaseMemEngine::~BaseMemEngine() -{} - -Port& -BaseMemEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->recvTimingResp(pkt); -} - -void -BaseMemEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } - - owner->wakeUp(); -} - -void -BaseMemEngine::processNextMemReqEvent() -{ - if ((respQueueSize == 0) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = memQueue.front(); - memPort.sendPacket(pkt); - onTheFlyReqs++; - DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. 
" - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - memQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - if (memRetryRequested && - (memQueue.size() <= - (memQueueSize - memSpaceRequested))) { - memRetryRequested = false; - memSpaceRequested = 0; - recvMemRetry(); - } - } - - if ((!memPort.blocked()) && - (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -PacketPtr -BaseMemEngine::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -bool -BaseMemEngine::allocateMemQueueSpace(int space) -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize == 0) || - (memQueue.size() <= (memQueueSize - space)) - ); -} - -bool -BaseMemEngine::memQueueFull() -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize != 0) && - (memQueue.size() == memQueueSize)); -} - -void 
-BaseMemEngine::enqueueMemReq(PacketPtr pkt) -{ - panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - memQueue.push_back(pkt); - DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -void -BaseMemEngine::requestMemRetry(int space) { - panic_if((memRetryRequested == true) || (memSpaceRequested != 0), - "You should not request another alarm without the first one being" - "responded to.\n"); - DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); - memRetryRequested = true; - memSpaceRequested = space; -} - -void -BaseMemEngine::wakeUp() -{ - assert(!nextMemReqEvent.scheduled()); - if (!memQueue.empty()) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -bool -BaseMemEngine::recvTimingResp(PacketPtr pkt) -{ - onTheFlyReqs--; - return handleMemResp(pkt); -} - -} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh deleted file mode 100644 index 01c862d555..0000000000 --- a/src/accl/graph/base/base_mem_engine.hh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ - -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseMemEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseMemEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseMemEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseMemEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - int memQueueSize; - int onTheFlyReqs; - bool memRetryRequested; - int memSpaceRequested; - std::deque memQueue; - - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - - protected: - - int respQueueSize; - const RequestorID _requestorId; - - size_t peerMemoryAtomSize; - - bool allocateMemQueueSpace(int space); - bool memQueueFull(); - - bool pendingMemRetry() { return memRetryRequested; } - void requestMemRetry(int space); - - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - void enqueueMemReq(PacketPtr pkt); - - virtual int respBuffSize() = 0; - virtual void recvMemRetry() = 0; - virtual bool handleMemResp(PacketPtr pkt) = 0; - - PacketPtr createReadPacket(Addr addr, unsigned int size); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - - public: - PARAMS(BaseMemEngine); - - BaseMemEngine(const BaseMemEngineParams ¶ms); - ~BaseMemEngine(); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID requestorId() { return _requestorId; } - - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); 
} - - bool recvTimingResp(PacketPtr pkt); - void recvFunctional(PacketPtr pkt); - - void wakeUp(); - -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9db95d6bd6..c60d189e0f 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -77,7 +77,11 @@ BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { blockedPacket = pkt; _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. blockedPacket %s.\n", + __func__, blockedPacket->print()); } else { + DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", + __func__, pkt->print()); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 5653ede698..f336edcbf1 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -48,14 +48,19 @@ class BaseMemoryEngine : public ClockedObject { private: bool _pending; + int _prevState; + public: MemoryEvent(const std::function &callback, const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) + EventFunctionWrapper(callback, name), + _pending(false), _prevState(0) {} bool pending() { return _pending; } void sleep() { _pending = true; } void wake() { _pending = false; } + void setPrevState(int state) { _prevState = state; } + int getPrevState() { return _prevState; } }; class MemPort : public RequestPort diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21f048213a..daaed28f1c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/coalesce_engine.hh" +#include + #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" @@ -53,7 +55,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), + nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -317,6 +319,10 @@ CoalesceEngine::processNextMemoryReadEvent() pendingEventQueue.push_back("nextMemoryReadEvent"); // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + "has been pushed to pendingEventQueue. " + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -366,11 +372,14 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); if (pendingEventQueue.empty()) { + DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); return; } std::string front = pendingEventQueue.front(); + DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); if (front == "nextMemoryReadEvent") { assert(!nextMemoryReadEvent.scheduled()); @@ -382,11 +391,11 @@ CoalesceEngine::recvMemRetry() assert(nextWriteBackEvent.pending()); schedule(nextWriteBackEvent, nextCycle()); nextWriteBackEvent.wake(); - } else if (front == "nextSendRetryEvent") { - assert(!nextSendRetryEvent.scheduled()); - assert(nextSendRetryEvent.pending()); - schedule(nextSendRetryEvent, nextCycle()); - nextSendRetryEvent.wake(); + } else if (front == "nextRecvPushRetryEvent") { + assert(!nextRecvPushRetryEvent.scheduled()); + assert(nextRecvPushRetryEvent.pending()); + schedule(nextRecvPushRetryEvent, nextCycle()); + nextRecvPushRetryEvent.wake(); } else { panic("EVENT IS NOT RECOGNIZED.\n"); } @@ -642,14 +651,16 @@ CoalesceEngine::processNextApplyEvent() int 
block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "apply process. Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " - "needed.\n", __func__, block_index); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. " + "Therefore, no apply needed.\n", __func__, block_index); } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", + __func__, block_index); for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; uint32_t new_prop = std::min( @@ -683,8 +694,9 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", - __func__, block_index, writeBackQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + "writeBackQueue.size = %u.\n", __func__, + block_index, writeBackQueue.size()); } applyQueue.pop_front(); @@ -710,6 +722,10 @@ CoalesceEngine::processNextWriteBackEvent() pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + "has been pushed to pendingEventQueue. 
" + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -774,121 +790,259 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextSendRetryEvent.pending()); - assert(!nextSendRetryEvent.scheduled()); + assert(!nextRecvPushRetryEvent.pending()); + assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextSendRetryEvent, nextCycle()); + schedule(nextRecvPushRetryEvent, nextCycle()); } -void -CoalesceEngine::processNextSendRetryEvent() +// void +// CoalesceEngine::processNextRecvPushRetryEvent() +// { +// assert(!nextRecvPushRetryEvent.pending()); +// assert(needsPush.count() != 0); + +// Addr block_addr = 0; +// int block_index = 0; +// int it = 0; +// uint32_t slice = 0; +// bool hit_in_cache = false; + +// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { +// for (int i = 0; i < numElementsPerLine; i++) { +// slice <<= 1; +// slice |= needsPush[it + i]; +// } +// if (slice) { +// block_addr = getBlockAddrFromBitIndex(it); +// block_index = getBlockIndex(block_addr); +// if ((cacheBlocks[block_index].addr == block_addr) && +// (cacheBlocks[block_index].valid)) { +// if (cacheBlocks[block_index].busyMask == 0) { +// hit_in_cache = true; +// break; +// } +// } else { +// hit_in_cache = false; +// break; +// } +// } +// } + +// assert(it < MAX_BITVECTOR_SIZE); +// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { +// currentBitSliceIndex = 0; +// } else { +// currentBitSliceIndex = it + numElementsPerLine; +// } + +// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " +// "in needsPush.\n", __func__, slice, it); + +// if (hit_in_cache) { +// int push_needed = 0; +// 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// // TODO: Make this more programmable +// uint32_t new_prop = std::min( +// cacheBlocks[block_index].items[i].prop, +// cacheBlocks[block_index].items[i].tempProp); +// cacheBlocks[block_index].items[i].tempProp = new_prop; +// cacheBlocks[block_index].items[i].prop = new_prop; +// if (needsPush[it + i] == 1) { +// peerPushEngine->recvWLItemRetry( +// cacheBlocks[block_index].items[i]); +// } +// push_needed += needsPush[it + i]; +// needsPush[it + i] = 0; +// } +// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// if (applyQueue.find(block_index)) { +// applyQueue.erase(block_index); +// if (applyQueue.empty() && nextApplyEvent.scheduled()) { +// deschedule(nextApplyEvent); +// } +// if (cacheBlocks[block_index].hasConflict) { +// writeBackQueue.push_back(block_index); +// assert(writeBackQueue.size() <= numLines); +// if ((!writeBackQueue.empty()) && +// (!nextWriteBackEvent.pending()) && +// (!nextWriteBackEvent.scheduled())) { +// schedule(nextWriteBackEvent, nextCycle()); +// } +// } +// } +// } else { +// if (memPort.blocked()) { +// nextRecvPushRetryEvent.sleep(); +// pendingEventQueue.push_back("nextRecvPushRetryEvent"); +// // Maximum three MemoryEvent. +// assert(pendingEventQueue.size() <= 3); +// return; +// } + +// // FIXME: Fix the retry mechanism between memory and cache to +// // handle memory retries correctly. This probably requires scheduling +// // an event for sending the retry. For now we're enabling infinite +// // queueing in the memQueue. +// // FIXME: Also do not send requests for cache lines that are already +// // read but await data. 
Just set a flag or sth. +// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// } + +// numRetriesReceived--; +// assert(numRetriesReceived == 0); +// assert(!nextRecvPushRetryEvent.scheduled()); +// } + +std::tuple +CoalesceEngine::getOptimalBitVectorSlice() { - assert(!nextSendRetryEvent.pending()); - assert(needsPush.count() != 0); + bool hit_in_cache; + int slice_base = -1; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - Addr block_addr = 0; - int block_index = 0; - int it = 0; - uint32_t slice = 0; - bool hit_in_cache = false; - - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { + int score = 0; + uint32_t current_popcount = 0; + for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + int current_score = 0; for (int i = 0; i < numElementsPerLine; i++) { - slice <<= 1; - slice |= needsPush[it + i]; + current_popcount += needsPush[it + i]; } - if (slice) { - block_addr = getBlockAddrFromBitIndex(it); - block_index = getBlockIndex(block_addr); - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - if (cacheBlocks[block_index].busyMask == 0) { - hit_in_cache = true; - break; - } - } else { + if (current_popcount == 0) { + continue; + } + current_score += current_popcount; + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].busyMask == 0)) { + current_score += numElementsPerLine * 2; + if (current_score > score) { + score = current_score; + slice_base = it; + hit_in_cache = true; + } + } else if (!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].allocated))) { + score += numElementsPerLine; + if (current_score > 
score) { + score = current_score; + slice_base = it; hit_in_cache = false; - break; } } } - assert(it < MAX_BITVECTOR_SIZE); - if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { - currentBitSliceIndex = 0; - } else { - currentBitSliceIndex = it + numElementsPerLine; - } + return std::make_tuple(hit_in_cache, slice_base); +} + +void +CoalesceEngine::processNextRecvPushRetryEvent() +{ + bool hit_in_cache; + int slice_base; + std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); - DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " - "in needsPush.\n", __func__, slice, it); + if (slice_base != -1) { + Addr addr = getBlockAddrFromBitIndex(slice_base); + int block_index = getBlockIndex(addr); + if (hit_in_cache) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + "its MemRetry.\n", __func__); + recvMemRetry(); + nextRecvPushRetryEvent.setPrevState(0); + } - if (hit_in_cache) { - int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = std::min( + int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + + for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - 
cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + if (needsPush[slice_base + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[slice_base + i]; + needsPush[slice_base + i] = 0; } - if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + if (cacheBlocks[block_index].hasConflict) { + writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); + } } } - } - } else { - if (memPort.blocked()) { - nextSendRetryEvent.sleep(); - pendingEventQueue.push_back("nextSendRetryEvent"); - // Maximum three MemoryEvent. 
- assert(pendingEventQueue.size() <= 3); - return; - } + } else { + if (memPort.blocked()) { + assert(nextRecvPushRetryEvent.getPrevState() != -1); + nextRecvPushRetryEvent.setPrevState(-1); + nextRecvPushRetryEvent.sleep(); + pendingEventQueue.push_back("nextRecvPushRetryEvent"); + assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + "and has been pushed to pendingEventQueue." + " pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); + return; + } + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + "unblocked by memPort. Setting prevState to 0.\n", __func__); + nextRecvPushRetryEvent.setPrevState(0); + } - // FIXME: Fix the retry mechanism between memory and cache to - // handle memory retries correctly. This probably requires scheduling - // an event for sending the retry. For now we're enabling infinite - // queueing in the memQueue. - // FIXME: Also do not send requests for cache lines that are already - // read but await data. Just set a flag or sth. - PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + // TODO: Set a tracking structure so that nextMemoryReadEvent knows + // It does not have to read this address anymore. It can simply set + // a flag to true (maybe not even needed just look if the cache has a + // line allocated for it in the cacheBlocks). 
+ } + numRetriesReceived--; + assert(numRetriesReceived == 0); + } + if (numRetriesReceived > 0) { + schedule(nextRecvPushRetryEvent, nextCycle()); } - - numRetriesReceived--; - assert(numRetriesReceived == 0); - assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b8cac15f5c..356fee0107 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalBitVectorSlice(); std::deque pendingEventQueue; @@ -121,8 +122,8 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - MemoryEvent nextSendRetryEvent; - void processNextSendRetryEvent(); + MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(); struct CoalesceStats : public statistics::Group { @@ -145,8 +146,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceStats stats; protected: - virtual void recvMemRetry(); - virtual bool handleMemResp(PacketPtr pkt); + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; public: PARAMS(CoalesceEngine); From 3642de0bd3e091dbad402a21f15ea82210cd011d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:49:11 -0700 Subject: [PATCH 134/279] Making bit vector smaller and choosing slices faster. 
--- src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index daaed28f1c..f86d6877ad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -914,9 +914,10 @@ CoalesceEngine::getOptimalBitVectorSlice() int slice_base = -1; int score = 0; - uint32_t current_popcount = 0; + int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { int current_score = 0; + uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } @@ -934,6 +935,9 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = true; + if (score == max_score_possible) { + break; + } } } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { @@ -942,6 +946,7 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = false; + assert(score < max_score_possible); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 356fee0107..f6ed4843fa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -37,7 +37,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 30) +#define MAX_BITVECTOR_SIZE (1 << 28) namespace gem5 { From 94da4600d17718aac026304d97728906ad76ccc7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 28 Jul 2022 06:36:15 -0700 Subject: [PATCH 135/279] Merging all memory interactions into one event. 
--- src/accl/graph/sega/coalesce_engine.cc | 559 +++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 24 +- 2 files changed, 255 insertions(+), 328 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f86d6877ad..4d7107274b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,17 +45,15 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - currentBitSliceIndex(0), - numRetriesReceived(0), - applyQueue(numLines), - writeBackQueue(numLines), - nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + numRetriesReceived(0), applyQueue(numLines), + // writeBackQueue(numLines), + nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), + // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), + // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,49 +67,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -// void -// CoalesceEngine::startup() -// { -// return; - // std::cout << "Hello" << std::endl; - // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", - // __func__, 
peerMemoryRange.to_string()); - // AddrRangeList vertex_ranges = getAddrRanges(); - - // bool found = false; - // Addr first_match_addr = 0; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(first_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // first_match_addr += peerMemoryAtomSize; - // } - - // found = false; - // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(second_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // second_match_addr += peerMemoryAtomSize; - // } - - // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - // memoryAddressOffset = first_match_addr; -// } - void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -260,15 +215,20 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", // __func__, fillQueue.size()); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -309,24 +269,24 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextMemoryReadEvent() +CoalesceEngine::processNextMemoryReadEvent(int block_index) { - assert(!nextMemoryReadEvent.pending()); - if (memPort.blocked()) { - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - nextMemoryReadEvent.sleep(); - pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvents. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextMemoryReadEvent.pending()); + // if (memPort.blocked()) { + // // TODO: Implement interface where events of the CoalesceEngine are + // // pushed to a fifo to be scheduled later. + // nextMemoryReadEvent.sleep(); + // pendingEventQueue.push_back("nextMemoryReadEvent"); + // // Maximum three MemoryEvents. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = fillQueue.front(); + // int block_index = fillQueue.front(); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -334,13 +294,11 @@ CoalesceEngine::processNextMemoryReadEvent() memPort.sendPacket(pkt); - fillQueue.pop_front(); + // fillQueue.pop_front(); - if (!fillQueue.empty()) { - assert(!nextMemoryReadEvent.scheduled()); - assert(!nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - } + // if (!fillQueue.empty()) { + // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); + // } } // TODO: For loop to empty the entire responseQueue. @@ -370,38 +328,70 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::recvMemRetry() +CoalesceEngine::processNextMemoryEvent() { - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - if (pendingEventQueue.empty()) { - DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); return; } - std::string front = pendingEventQueue.front(); - DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - if (front == "nextMemoryReadEvent") { - assert(!nextMemoryReadEvent.scheduled()); - assert(nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - nextMemoryReadEvent.wake(); - } else if (front == "nextWriteBackEvent") { - assert(!nextWriteBackEvent.scheduled()); - assert(nextWriteBackEvent.pending()); - schedule(nextWriteBackEvent, nextCycle()); - nextWriteBackEvent.wake(); - } else if (front == "nextRecvPushRetryEvent") { - assert(!nextRecvPushRetryEvent.scheduled()); - assert(nextRecvPushRetryEvent.pending()); - schedule(nextRecvPushRetryEvent, nextCycle()); - nextRecvPushRetryEvent.wake(); - } else { - panic("EVENT IS NOT RECOGNIZED.\n"); + 
std::function next_memory_function; + int next_memory_function_input; + std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); } +} - pendingEventQueue.pop_front(); - return; +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + // if (pendingEventQueue.empty()) { + // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + // return; + // } + + // std::string front = pendingEventQueue.front(); + // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); + + // if (front == "nextMemoryReadEvent") { + // assert(!nextMemoryReadEvent.scheduled()); + // assert(nextMemoryReadEvent.pending()); + // schedule(nextMemoryReadEvent, nextCycle()); + // nextMemoryReadEvent.wake(); + // } else if (front == "nextWriteBackEvent") { + // assert(!nextWriteBackEvent.scheduled()); + // assert(nextWriteBackEvent.pending()); + // schedule(nextWriteBackEvent, nextCycle()); + // nextWriteBackEvent.wake(); + // } else if (front == "nextRecvPushRetryEvent") { + // assert(!nextRecvPushRetryEvent.scheduled()); + // assert(nextRecvPushRetryEvent.pending()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // nextRecvPushRetryEvent.wake(); + // } else { + // panic("EVENT IS NOT RECOGNIZED.\n"); + // } + + // pendingEventQueue.pop_front(); + // return; + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, 
nextCycle()); } // FIXME: Fix this function. @@ -464,12 +454,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { @@ -528,9 +523,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); - assert((cacheBlocks[block_index].allocated) && // allocated cache block - (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + // assert((cacheBlocks[block_index].allocated) && // allocated cache block + // (!cacheBlocks[block_index].valid) && // valid is false + // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -691,22 +689,21 @@ CoalesceEngine::processNextApplyEvent() } // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
" - "writeBackQueue.size = %u.\n", __func__, - block_index, writeBackQueue.size()); + if ((cacheBlocks[block_index].hasConflict) && + (cacheBlocks[block_index].busyMask == 0)) { + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + // "writeBackQueue.size = %u.\n", __func__, + // block_index, writeBackQueue.size()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } applyQueue.pop_front(); - - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } - if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -714,22 +711,22 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent() +CoalesceEngine::processNextWriteBackEvent(int block_index) { - assert(!nextWriteBackEvent.pending()); - if (memPort.blocked()) { - nextWriteBackEvent.sleep(); - pendingEventQueue.push_back("nextWriteBackEvent"); - // Maximum three MemoryEvent. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextWriteBackEvent.pending()); + // if (memPort.blocked()) { + // nextWriteBackEvent.sleep(); + // pendingEventQueue.push_back("nextWriteBackEvent"); + // // Maximum three MemoryEvent. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = writeBackQueue.front(); + // int block_index = writeBackQueue.front(); // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -769,21 +766,35 @@ CoalesceEngine::processNextWriteBackEvent() cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())){ - schedule(nextMemoryReadEvent, nextCycle()); - } + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())){ + // schedule(nextMemoryReadEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } - writeBackQueue.pop_front(); - - if (!writeBackQueue.empty()) { - assert(!nextWriteBackEvent.pending()); - assert(!nextWriteBackEvent.scheduled()); - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.pop_front(); + // assert(writeBackQueue.size() <= numLines); + // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" + // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", + // __func__, block_index, writeBackQueue.size(), numLines); + + // if (!writeBackQueue.empty()) { + // assert(!nextWriteBackEvent.pending()); + // assert(!nextWriteBackEvent.scheduled()); + // schedule(nextWriteBackEvent, nextCycle()); + // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // } } void @@ -793,130 +804,28 @@ CoalesceEngine::recvPushRetry() DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextRecvPushRetryEvent.pending()); - assert(!nextRecvPushRetryEvent.scheduled()); + // assert(!nextRecvPushRetryEvent.pending()); + // assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextRecvPushRetryEvent, nextCycle()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } -// void -// CoalesceEngine::processNextRecvPushRetryEvent() -// { -// assert(!nextRecvPushRetryEvent.pending()); -// assert(needsPush.count() != 0); - -// Addr block_addr = 0; -// int block_index = 0; -// int it = 0; -// uint32_t slice = 0; -// bool hit_in_cache = false; - -// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { -// for (int i = 0; i < numElementsPerLine; i++) { -// slice <<= 1; -// slice |= needsPush[it + i]; -// } -// if (slice) { -// block_addr = getBlockAddrFromBitIndex(it); -// block_index = 
getBlockIndex(block_addr); -// if ((cacheBlocks[block_index].addr == block_addr) && -// (cacheBlocks[block_index].valid)) { -// if (cacheBlocks[block_index].busyMask == 0) { -// hit_in_cache = true; -// break; -// } -// } else { -// hit_in_cache = false; -// break; -// } -// } -// } - -// assert(it < MAX_BITVECTOR_SIZE); -// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { -// currentBitSliceIndex = 0; -// } else { -// currentBitSliceIndex = it + numElementsPerLine; -// } - -// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " -// "in needsPush.\n", __func__, slice, it); - -// if (hit_in_cache) { -// int push_needed = 0; -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// // TODO: Make this more programmable -// uint32_t new_prop = std::min( -// cacheBlocks[block_index].items[i].prop, -// cacheBlocks[block_index].items[i].tempProp); -// cacheBlocks[block_index].items[i].tempProp = new_prop; -// cacheBlocks[block_index].items[i].prop = new_prop; -// if (needsPush[it + i] == 1) { -// peerPushEngine->recvWLItemRetry( -// cacheBlocks[block_index].items[i]); -// } -// push_needed += needsPush[it + i]; -// needsPush[it + i] = 0; -// } -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// if (applyQueue.find(block_index)) { -// applyQueue.erase(block_index); -// if (applyQueue.empty() && nextApplyEvent.scheduled()) { -// deschedule(nextApplyEvent); -// } -// if (cacheBlocks[block_index].hasConflict) { -// writeBackQueue.push_back(block_index); -// assert(writeBackQueue.size() <= numLines); -// if ((!writeBackQueue.empty()) && -// (!nextWriteBackEvent.pending()) && -// (!nextWriteBackEvent.scheduled())) { -// 
schedule(nextWriteBackEvent, nextCycle()); -// } -// } -// } -// } else { -// if (memPort.blocked()) { -// nextRecvPushRetryEvent.sleep(); -// pendingEventQueue.push_back("nextRecvPushRetryEvent"); -// // Maximum three MemoryEvent. -// assert(pendingEventQueue.size() <= 3); -// return; -// } - -// // FIXME: Fix the retry mechanism between memory and cache to -// // handle memory retries correctly. This probably requires scheduling -// // an event for sending the retry. For now we're enabling infinite -// // queueing in the memQueue. -// // FIXME: Also do not send requests for cache lines that are already -// // read but await data. Just set a flag or sth. -// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// } - -// numRetriesReceived--; -// assert(numRetriesReceived == 0); -// assert(!nextRecvPushRetryEvent.scheduled()); -// } - std::tuple CoalesceEngine::getOptimalBitVectorSlice() { - bool hit_in_cache; + bool hit_in_cache = false; int slice_base = -1; - int score = 0; - int max_score_possible = 3 * numElementsPerLine; + // int score = 0; + // int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - int current_score = 0; + // int current_score = 0; uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; @@ -924,30 +833,32 @@ CoalesceEngine::getOptimalBitVectorSlice() if (current_popcount == 0) { continue; } - current_score += current_popcount; + // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].busyMask == 0)) { - current_score += numElementsPerLine * 2; - if (current_score > score) { - score = current_score; - slice_base = 
it; - hit_in_cache = true; - if (score == max_score_possible) { - break; - } - } + // current_score += numElementsPerLine * 2; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = true; + // if (score == max_score_possible) { + // break; + // } + // } + return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { - score += numElementsPerLine; - if (current_score > score) { - score = current_score; - slice_base = it; - hit_in_cache = false; - assert(score < max_score_possible); - } + // score += numElementsPerLine; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = false; + // assert(score < max_score_possible); + // } + return std::make_tuple(false, it); } } @@ -955,11 +866,11 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent() +CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); + std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); if (slice_base != -1) { Addr addr = getBlockAddrFromBitIndex(slice_base); @@ -969,12 +880,12 @@ CoalesceEngine::processNextRecvPushRetryEvent() assert(cacheBlocks[block_index].busyMask == 0); // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - "its MemRetry.\n", __func__); - recvMemRetry(); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + // "its MemRetry.\n", __func__); + // recvMemRetry(); + // nextRecvPushRetryEvent.setPrevState(0); + // } int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", @@ -1005,33 +916,38 @@ 
CoalesceEngine::processNextRecvPushRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { - if (memPort.blocked()) { - assert(nextRecvPushRetryEvent.getPrevState() != -1); - nextRecvPushRetryEvent.setPrevState(-1); - nextRecvPushRetryEvent.sleep(); - pendingEventQueue.push_back("nextRecvPushRetryEvent"); - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - "and has been pushed to pendingEventQueue." - " pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // if (memPort.blocked()) { + // // assert(nextRecvPushRetryEvent.getPrevState() != -1); + // nextRecvPushRetryEvent.setPrevState(-1); + // nextRecvPushRetryEvent.sleep(); + // pendingEventQueue.push_back("nextRecvPushRetryEvent"); + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + // "and has been pushed to pendingEventQueue." + // " pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + // "unblocked by memPort. Setting prevState to 0.\n", __func__); + // nextRecvPushRetryEvent.setPrevState(0); + // } PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -1045,8 +961,15 @@ CoalesceEngine::processNextRecvPushRetryEvent() numRetriesReceived--; assert(numRetriesReceived == 0); } + // if (numRetriesReceived > 0) { + // schedule(nextRecvPushRetryEvent, nextCycle()); + // } if (numRetriesReceived > 0) { - schedule(nextRecvPushRetryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f6ed4843fa..4036dc49af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -92,26 +92,30 @@ class CoalesceEngine : public BaseMemoryEngine int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque fillQueue; + // std::deque fillQueue; std::deque> responseQueue; - int currentBitSliceIndex; int numRetriesReceived; InOutSet applyQueue; std::bitset needsPush; - InOutSet writeBackQueue; + // InOutSet writeBackQueue; + int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - std::deque pendingEventQueue; + // std::deque pendingEventQueue; + + std::deque, int>> memoryFunctionQueue; + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); - MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(); + // MemoryEvent nextMemoryReadEvent; + void 
processNextMemoryReadEvent(int block_index); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -119,11 +123,11 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(); + // MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(int block_index); - MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(); + // MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(int slice_base); struct CoalesceStats : public statistics::Group { From 0b726a7fb5f41ed35b04ece74b71a1a3d36edec0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 10:59:33 -0700 Subject: [PATCH 136/279] Adding more dprintfs. --- src/accl/graph/base/data_structs.hh | 36 +- src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 676 ++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 36 +- 4 files changed, 275 insertions(+), 481 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f178d5a7e2..707b57c56f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,7 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include -#include -#include +#include namespace gem5 { @@ -90,49 +88,51 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class InOutSet +class UniqueFIFO { private: - std::unordered_set set; + std::list fifo; public: - InOutSet(int cap) - { - set.reserve(cap); - } + UniqueFIFO() {} void push_back(T item) { - if (set.find(item) == set.end()) { - set.insert(item); + if (!find(item)) { + fifo.push_back(item); } } void pop_front() { - assert(set.begin() != set.end()); - set.erase(set.begin()); + assert(!fifo.empty()); + fifo.pop_front(); } T front() { - return *(set.begin()); + return 
fifo.front(); } size_t size() { - return set.size(); + return fifo.size(); } bool empty() { - return (size() == 0); + return fifo.empty(); } bool find(T item) { - return (set.find(item) != set.end()); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + return (it != fifo.end()); } void erase(T item) { - set.erase(item); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + assert(it != fifo.end()); + fifo.erase(it); } }; diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index c60d189e0f..a5d1d7e8e7 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -73,15 +73,15 @@ void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; - DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. 
blockedPacket %s.\n", - __func__, blockedPacket->print()); + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); } else { - DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", - __func__, pkt->print()); + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d7107274b..6ed94fe938 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,14 +46,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), applyQueue(numLines), - // writeBackQueue(numLines), - nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), - // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyEvent([this] { processNextApplyEvent(); }, name()), - // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + numRetriesReceived(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -79,8 +81,6 @@ CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", - __func__, addr, trimmed_addr); return ((int) (trimmed_addr / 
peerMemoryAtomSize)) % numLines; } @@ -108,21 +108,25 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", - __func__, addr); - Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); // Hit // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextRespondEvent for latency cycles in + // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); @@ -138,12 +142,12 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); - // TODO: Add a stat to count the number of WLItems that have been touched. + // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - if (!nextRespondEvent.scheduled()) { - schedule(nextRespondEvent, nextCycle()); + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; @@ -151,44 +155,50 @@ CoalesceEngine::recvWLRead(Addr addr) // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " - "found in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " - "Rejecting request.\n", __func__); + "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && (cacheBlocks[block_index].valid)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " - "applyQueue.size = %u.\n", __func__, - block_index, applyQueue.size()); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " + "busy. It %s in the applyQueue.\n", + __func__, block_index, + applyQueue.find(block_index) ? "is" : "is not"); + if (!applyQueue.find(block_index)) { + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to " + "applyQueue. 
applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); + } assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -208,24 +218,18 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // FIXME: Fix this DPRINTF - // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " - // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", - // __func__, fillQueue.size()); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -236,21 +240,23 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " - "in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", - __func__, block_index); + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); stats.readRejections++; return false; } - if ((!cacheBlocks[block_index].hasConflict) && - (aligned_addr != cacheBlocks[block_index].addr)) { + if ((aligned_addr != cacheBlocks[block_index].addr)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; + } else { + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); } if (aligned_addr != cacheBlocks[block_index].addr) { @@ -260,295 +266,88 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; } } } -void -CoalesceEngine::processNextMemoryReadEvent(int block_index) -{ - // assert(!nextMemoryReadEvent.pending()); - // if (memPort.blocked()) { - // // TODO: Implement interface where events of the CoalesceEngine are - // // pushed to a fifo to be scheduled later. - // nextMemoryReadEvent.sleep(); - // pendingEventQueue.push_back("nextMemoryReadEvent"); - // // Maximum three MemoryEvents. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - // "has been pushed to pendingEventQueue. " - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = fillQueue.front(); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - - memPort.sendPacket(pkt); - - // fillQueue.pop_front(); - - // if (!fillQueue.empty()) { - // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); - // } -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextRespondEvent() -{ - Addr addr_response; - WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - - if ((!nextRespondEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - nextMemoryEvent.sleep(); - return; - } - - std::function next_memory_function; - int next_memory_function_input; - std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); - memoryFunctionQueue.pop_front(); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - // if (pendingEventQueue.empty()) { - // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - // return; - // } - - // std::string front = pendingEventQueue.front(); - // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - // if (front == "nextMemoryReadEvent") { - // assert(!nextMemoryReadEvent.scheduled()); - // assert(nextMemoryReadEvent.pending()); - // schedule(nextMemoryReadEvent, nextCycle()); - // nextMemoryReadEvent.wake(); - // } else if (front == "nextWriteBackEvent") { - // assert(!nextWriteBackEvent.scheduled()); - // assert(nextWriteBackEvent.pending()); - // schedule(nextWriteBackEvent, nextCycle()); - // nextWriteBackEvent.wake(); - // } else if (front == "nextRecvPushRetryEvent") { - // assert(!nextRecvPushRetryEvent.scheduled()); - // assert(nextRecvPushRetryEvent.pending()); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // nextRecvPushRetryEvent.wake(); - // } else { - // panic("EVENT IS NOT RECOGNIZED.\n"); - // } - - // pendingEventQueue.pop_front(); - // return; - - if (!nextMemoryEvent.pending()) { - DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " - "the packet.\n", __func__, pkt->getAddr()); return true; } + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + if (pkt->findNextSenderState()) { - Addr addr = pkt->getAddr(); + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. int it = getBitIndexBase(addr); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - // We read the address to send the wl but it is put in cache before - // the read response arrives. - if (cacheBlocks[block_index].busyMask == 0) { - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as idle.\n", - __func__, addr); - int push_needed = 0; - // It is not busy anymore, we have to send the wl from cache. 
- DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // Since we have just applied the line, we can take it out of - // the applyQueue if it's in there. No need to do the same - // thing for evictQueue. - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - } - } else { - // The line is busy. 
Therefore, we have to disregard the data - // we received from the memory and also tell the push engine to - // deallocate the space it allocated for this retry. However, - // we still have to rememeber that these items need a retry. - // i.e. don't change needsPush, call recvWLItemRetry with - // do_push = false - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as busy.\n", - __func__, addr); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } - } else { - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); - WorkListItem* items = pkt->getPtr(); - int push_needed = 0; - // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); + WorkListItem* items = pkt->getPtr(); + int push_needed = 0; + // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); + push_needed += needsPush[it + i]; + needsPush[it + i] = 0; } - + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + // } delete pkt; return true; } - Addr addr = pkt->getAddr(); - // int block_index = (addr / peerMemoryAtomSize) % numLines; - int block_index = getBlockIndex(addr); - - DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", - __func__, pkt->getAddr()); - // assert((cacheBlocks[block_index].allocated) && // allocated cache block - // (!cacheBlocks[block_index].valid) && // valid is false - // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR - assert(cacheBlocks[block_index].allocated); - assert(!cacheBlocks[block_index].valid); - assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, - block_index, i, 
cacheBlocks[block_index].items[i].to_string()); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); + } + cacheBlocks[block_index].valid = true; + delete pkt; } - cacheBlocks[block_index].valid = true; - delete pkt; // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " - "be serviced with the received packet.\n", - __func__, miss_addr, block_index); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); @@ -567,10 +366,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block - servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - "removal.\n", __func__, i, block_index); + // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " + // "removal.\n", __func__, i, block_index); } } @@ -593,19 +391,46 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(cacheBlocks[block_index].hasConflict); } - if ((!nextRespondEvent.scheduled()) && + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); + schedule(nextResponseEvent, nextCycle()); } return true; } +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + Addr addr_response; + WorkListItem worklist_response; + + std::tie(addr_response, worklist_response) = responseQueue.front(); + peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -691,12 +516,11 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if ((cacheBlocks[block_index].hasConflict) && (cacheBlocks[block_index].busyMask == 0)) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " - // "writeBackQueue.size = %u.\n", __func__, - // block_index, writeBackQueue.size()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " + "to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -711,23 +535,47 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent(int block_index) +CoalesceEngine::processNextMemoryEvent() { - // assert(!nextWriteBackEvent.pending()); - // if (memPort.blocked()) { - // nextWriteBackEvent.sleep(); - // pendingEventQueue.push_back("nextWriteBackEvent"); - // // Maximum three MemoryEvent. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - // "has been pushed to pendingEventQueue. 
" - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = writeBackQueue.front(); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + std::tie( + next_memory_function, + next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index) +{ + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + + memPort.sendPacket(pkt); +} + +void +CoalesceEngine::processNextWriteBack(int block_index) +{ // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -749,6 +597,10 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); memPort.sendPacket(write_pkt); + } else { + DPRINTF(CoalesceEngine, "%s: No change observed on " + "cacheBlocks[%d]. 
No write back needed.\n", + __func__, block_index); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -756,7 +608,7 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "cacheBlocks[%d] is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].busyMask = 0; @@ -766,53 +618,12 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())){ - // schedule(nextMemoryReadEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - - // writeBackQueue.pop_front(); - // assert(writeBackQueue.size() <= numLines); - // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" - // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", - // __func__, block_index, writeBackQueue.size(), numLines); - - // if (!writeBackQueue.empty()) { - // assert(!nextWriteBackEvent.pending()); - // assert(!nextWriteBackEvent.scheduled()); - // schedule(nextWriteBackEvent, nextCycle()); - // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // } -} -void -CoalesceEngine::recvPushRetry() -{ - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - // assert(!nextRecvPushRetryEvent.pending()); - // assert(!nextRecvPushRetryEvent.scheduled()); - assert(numRetriesReceived == 1); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // TODO: Pass slice_base to getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " + "memoryFunctionQueue.\n", __func__, block_index); } } @@ -866,7 +677,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) +CoalesceEngine::processNextPushRetry(int slice_base_2) { bool hit_in_cache; int slice_base; @@ -879,14 +690,6 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - // if nextRecvPushRetryEvent has been blocked by memory before - // if 
(nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - // "its MemRetry.\n", __func__); - // recvMemRetry(); - // nextRecvPushRetryEvent.setPrevState(0); - // } - int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); @@ -916,39 +719,15 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" + " input %d to memoryFunctionQueue.\n", + __func__, block_index); } } } else { - // if (memPort.blocked()) { - // // assert(nextRecvPushRetryEvent.getPrevState() != -1); - // nextRecvPushRetryEvent.setPrevState(-1); - // nextRecvPushRetryEvent.sleep(); - // pendingEventQueue.push_back("nextRecvPushRetryEvent"); - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - // "and has been pushed to pendingEventQueue." - // " pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - // if nextRecvPushRetryEvent has been blocked by memory before - // if (nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - // "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - // nextRecvPushRetryEvent.setPrevState(0); - // } - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); @@ -961,18 +740,53 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) numRetriesReceived--; assert(numRetriesReceived == 0); } - // if (numRetriesReceived > 0) { - // schedule(nextRecvPushRetryEvent, nextCycle()); - // } + if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvPushRetry() +{ + numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. 
+ assert(numRetriesReceived == 1); + + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " + "memoryFunctionQueue.\n", __func__); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } } + + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4036dc49af..7db09cec11 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -77,58 +77,40 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - // int nmpu; - // Addr memoryAddressOffset; - WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block* cacheBlocks; - int numLines; int numElementsPerLine; + Block* cacheBlocks; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - - // std::deque fillQueue; - std::deque> responseQueue; int numRetriesReceived; - InOutSet applyQueue; + UniqueFIFO applyQueue; std::bitset needsPush; - // InOutSet writeBackQueue; - - int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - // std::deque pendingEventQueue; - - std::deque, int>> memoryFunctionQueue; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); + void processNextRead(int block_index); + void processNextWriteBack(int block_index); + void processNextPushRetry(int slice_base); + std::deque, int>> memoryFunctionQueue; - // MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(int block_index); - - EventFunctionWrapper nextRespondEvent; - void processNextRespondEvent(); + EventFunctionWrapper 
nextResponseEvent; + void processNextResponseEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - // MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(int block_index); - - // MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(int slice_base); - struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -164,8 +146,6 @@ class CoalesceEngine : public BaseMemoryEngine void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); - - // virtual void startup() override; }; } From 14d331eaa519176686ad8e45a703ddb2344e3d0d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 16:59:30 -0700 Subject: [PATCH 137/279] Fixing cache block state machine. wip. --- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/coalesce_engine.cc | 385 ++++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 31 +- src/accl/graph/sega/state_machine.md | 1 + 4 files changed, 368 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/sega/state_machine.md diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 97a62d44a0..81a29df6af 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,5 +43,6 @@ DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') +DebugFlag('CacheBlockState') DebugFlag('PushEngine') DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6ed94fe938..a0c85de2f5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/CacheBlockState.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -104,11 +105,180 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return 
peerMemoryRange.addIntlvBits(trimmed_addr); } +// TODO: Prev implementaton of recvWLRead. Remove +// bool +// CoalesceEngine::recvWLRead(Addr addr) +// { +// assert(MSHR.size() <= numMSHREntries); + +// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); +// assert(aligned_addr % peerMemoryAtomSize == 0); +// int block_index = getBlockIndex(aligned_addr); +// assert(block_index < numLines); +// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); +// assert(wl_offset < numElementsPerLine); +// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " +// "This request maps to cacheBlocks[%d], aligned_addr: " +// "%lu, and wl_offset: %d.\n", __func__, addr, +// block_index, aligned_addr, wl_offset); + +// if ((cacheBlocks[block_index].addr == aligned_addr) && +// (cacheBlocks[block_index].valid)) { +// assert(cacheBlocks[block_index].allocated); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); +// // Hit +// // TODO: Add a hit latency as a param for this object. +// // Can't just schedule the nextResponseEvent for latency cycles in +// // the future. +// responseQueue.push_back(std::make_tuple(addr, +// cacheBlocks[block_index].items[wl_offset])); +// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// // TODO: Stat to count the number of WLItems that have been touched. 
+// cacheBlocks[block_index].busyMask |= (1 << wl_offset); +// stats.readHits++; + +// if (!nextResponseEvent.scheduled()) { +// schedule(nextResponseEvent, nextCycle()); +// } +// stats.numVertexReads++; +// return true; +// } else { +// // miss +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); +// if (MSHR.find(block_index) == MSHR.end()) { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" +// " %lu not found in MSHRs.\n", __func__, block_index, addr); +// assert(MSHR.size() <= numMSHREntries); +// if (MSHR.size() == numMSHREntries) { +// // Out of MSHR entries +// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " +// "Rejecting request.\n", __func__); +// // TODO: Break out read rejections into more than one stat +// // based on the cause of the rejection +// stats.readRejections++; +// return false; +// } else { +// DPRINTF(CoalesceEngine, "%s: MSHR " +// "entries available.\n", __func__); +// if (cacheBlocks[block_index].allocated) { +// assert(MSHR[block_index].size() <= numTgtsPerMSHR); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// cacheBlocks[block_index].hasConflict = true; +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.readMisses++; +// stats.numVertexReads++; +// if ((cacheBlocks[block_index].busyMask == 0) && +// (cacheBlocks[block_index].valid)) { +// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " +// "busy. It %s in the applyQueue.\n", +// __func__, block_index, +// applyQueue.find(block_index) ? 
"is" : "is not"); +// if (!applyQueue.find(block_index)) { +// applyQueue.push_back(block_index); +// DPRINTF(CoalesceEngine, "%s: Added %d to " +// "applyQueue. applyQueue.size = %u.\n", +// __func__, block_index, applyQueue.size()); +// } +// assert(!applyQueue.empty()); +// if ((!nextApplyEvent.scheduled())) { +// schedule(nextApplyEvent, nextCycle()); +// } +// } +// return true; +// } else { +// assert(!cacheBlocks[block_index].valid); +// assert(MSHR[block_index].size() == 0); +// // MSHR available and no conflict +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " +// "Allocating a cache line for it.\n" +// , __func__, addr); + +// cacheBlocks[block_index].addr = aligned_addr; +// cacheBlocks[block_index].busyMask = 0; +// cacheBlocks[block_index].allocated = true; +// cacheBlocks[block_index].valid = false; +// cacheBlocks[block_index].hasConflict = false; +// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" +// " Addr: %lu.\n", __func__, block_index, addr); +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// memoryFunctionQueue.emplace_back( +// [this] (int block_index) { +// processNextRead(block_index); +// }, block_index); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " +// "input %d to memoryFunctionQueue.\n", +// __func__, block_index); +// if ((!nextMemoryEvent.pending()) && +// (!nextMemoryEvent.scheduled())) { +// schedule(nextMemoryEvent, nextCycle()); +// } +// stats.readMisses++; +// stats.numVertexReads++; +// return true; +// } +// } +// } else { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " +// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. 
Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// if ((aligned_addr != cacheBlocks[block_index].addr)) { +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// cacheBlocks[block_index].hasConflict = true; +// } else { +// DPRINTF(CoalesceEngine, "%s: There is room for another target " +// "for cacheBlocks[%d].\n", __func__, block_index); +// } + +// if (aligned_addr != cacheBlocks[block_index].addr) { +// stats.readMisses++; +// } else { +// stats.readHitUnderMisses++; +// } + +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " +// "cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.numVertexReads++; +// return true; +// } +// } +// } + bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHR.size() <= numMSHREntries); - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = getBlockIndex(aligned_addr); @@ -119,11 +289,18 @@ CoalesceEngine::recvWLRead(Addr addr) "This request maps to cacheBlocks[%d], aligned_addr: " "%lu, and wl_offset: %d.\n", __func__, addr, block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].allocated); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); // Hit // TODO: Add a hit latency as a param for this object. 
// Can't just schedule the nextResponseEvent for latency cycles in @@ -144,20 +321,60 @@ CoalesceEngine::recvWLRead(Addr addr) peerWLEngine->getRegisterFileSize()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - stats.readHits++; + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if (!nextResponseEvent.scheduled()) { schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.readRejections++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + return true; } else { // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" " %lu not found in MSHRs.\n", __func__, block_index, addr); - assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " @@ -169,11 +386,12 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR " "entries available.\n", __func__); - if (cacheBlocks[block_index].allocated) { - assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", @@ -181,43 +399,116 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - cacheBlocks[block_index].hasConflict = true; + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + "to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + "not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; + // TODO: Add readConflicts here. 
+ stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " - "busy. It %s in the applyQueue.\n", - __func__, block_index, - applyQueue.find(block_index) ? "is" : "is not"); - if (!applyQueue.find(block_index)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to " - "applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } - assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } return true; } else { - assert(!cacheBlocks[block_index].valid); - assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // cacheBlocks[block_index].allocated = true; + // cacheBlocks[block_index].hasConflict = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); @@ -234,6 +525,9 @@ CoalesceEngine::recvWLRead(Addr addr) 
(!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); stats.readMisses++; stats.numVertexReads++; return true; @@ -241,7 +535,11 @@ CoalesceEngine::recvWLRead(Addr addr) } } else { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", @@ -249,21 +547,12 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - if ((aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - cacheBlocks[block_index].hasConflict = true; - } else { - DPRINTF(CoalesceEngine, "%s: There is room for another target " + DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - } - if (aligned_addr != cacheBlocks[block_index].addr) { - stats.readMisses++; - } else { - stats.readHitUnderMisses++; - } + // cacheBlocks[block_index].hasConflict = true; + // TODO: Might want to differentiate between different misses. 
+ stats.readMisses++; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " @@ -324,8 +613,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].addr == addr) { - assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -335,6 +631,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].pendingData = false; delete pkt; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7db09cec11..e7655a069e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -51,24 +52,42 @@ class CoalesceEngine : public BaseMemoryEngine { WorkListItem* items; Addr addr; - uint8_t busyMask; - bool allocated; + uint64_t busyMask; bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + + bool allocated; bool hasConflict; - bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), busyMask(0), - allocated(false), 
valid(false), - hasConflict(false), - dirty(false) + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + allocated(false), + hasConflict(false) { items = new WorkListItem [num_elements]; } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s}", addr, busyMask, + valid ? "true" : "false", needsApply ? "true" : "false", + needsWB ? "true" : "false", pendingData ? "true" : "false", + pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + } }; struct SenderState : public Packet::SenderState diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file From 30e94f36e8af4cd098303c1d665bcacd268ed6e2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 30 Jul 2022 23:14:08 -0700 Subject: [PATCH 138/279] Fixing cache block state machine. cont. wip --- src/accl/graph/sega/coalesce_engine.cc | 288 +++++++++---------------- 1 file changed, 98 insertions(+), 190 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a0c85de2f5..8f33a2d893 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -105,177 +105,6 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return peerMemoryRange.addIntlvBits(trimmed_addr); } -// TODO: Prev implementaton of recvWLRead. 
Remove -// bool -// CoalesceEngine::recvWLRead(Addr addr) -// { -// assert(MSHR.size() <= numMSHREntries); - -// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); -// assert(aligned_addr % peerMemoryAtomSize == 0); -// int block_index = getBlockIndex(aligned_addr); -// assert(block_index < numLines); -// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); -// assert(wl_offset < numElementsPerLine); -// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " -// "This request maps to cacheBlocks[%d], aligned_addr: " -// "%lu, and wl_offset: %d.\n", __func__, addr, -// block_index, aligned_addr, wl_offset); - -// if ((cacheBlocks[block_index].addr == aligned_addr) && -// (cacheBlocks[block_index].valid)) { -// assert(cacheBlocks[block_index].allocated); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); -// // Hit -// // TODO: Add a hit latency as a param for this object. -// // Can't just schedule the nextResponseEvent for latency cycles in -// // the future. -// responseQueue.push_back(std::make_tuple(addr, -// cacheBlocks[block_index].items[wl_offset])); -// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// // TODO: Stat to count the number of WLItems that have been touched. 
-// cacheBlocks[block_index].busyMask |= (1 << wl_offset); -// stats.readHits++; - -// if (!nextResponseEvent.scheduled()) { -// schedule(nextResponseEvent, nextCycle()); -// } -// stats.numVertexReads++; -// return true; -// } else { -// // miss -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); -// if (MSHR.find(block_index) == MSHR.end()) { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" -// " %lu not found in MSHRs.\n", __func__, block_index, addr); -// assert(MSHR.size() <= numMSHREntries); -// if (MSHR.size() == numMSHREntries) { -// // Out of MSHR entries -// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " -// "Rejecting request.\n", __func__); -// // TODO: Break out read rejections into more than one stat -// // based on the cause of the rejection -// stats.readRejections++; -// return false; -// } else { -// DPRINTF(CoalesceEngine, "%s: MSHR " -// "entries available.\n", __func__); -// if (cacheBlocks[block_index].allocated) { -// assert(MSHR[block_index].size() <= numTgtsPerMSHR); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// cacheBlocks[block_index].hasConflict = true; -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.readMisses++; -// stats.numVertexReads++; -// if ((cacheBlocks[block_index].busyMask == 0) && -// (cacheBlocks[block_index].valid)) { -// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " -// "busy. It %s in the applyQueue.\n", -// __func__, block_index, -// applyQueue.find(block_index) ? 
"is" : "is not"); -// if (!applyQueue.find(block_index)) { -// applyQueue.push_back(block_index); -// DPRINTF(CoalesceEngine, "%s: Added %d to " -// "applyQueue. applyQueue.size = %u.\n", -// __func__, block_index, applyQueue.size()); -// } -// assert(!applyQueue.empty()); -// if ((!nextApplyEvent.scheduled())) { -// schedule(nextApplyEvent, nextCycle()); -// } -// } -// return true; -// } else { -// assert(!cacheBlocks[block_index].valid); -// assert(MSHR[block_index].size() == 0); -// // MSHR available and no conflict -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " -// "Allocating a cache line for it.\n" -// , __func__, addr); - -// cacheBlocks[block_index].addr = aligned_addr; -// cacheBlocks[block_index].busyMask = 0; -// cacheBlocks[block_index].allocated = true; -// cacheBlocks[block_index].valid = false; -// cacheBlocks[block_index].hasConflict = false; -// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" -// " Addr: %lu.\n", __func__, block_index, addr); -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// memoryFunctionQueue.emplace_back( -// [this] (int block_index) { -// processNextRead(block_index); -// }, block_index); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " -// "input %d to memoryFunctionQueue.\n", -// __func__, block_index); -// if ((!nextMemoryEvent.pending()) && -// (!nextMemoryEvent.scheduled())) { -// schedule(nextMemoryEvent, nextCycle()); -// } -// stats.readMisses++; -// stats.numVertexReads++; -// return true; -// } -// } -// } else { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " -// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. 
Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// if ((aligned_addr != cacheBlocks[block_index].addr)) { -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// cacheBlocks[block_index].hasConflict = true; -// } else { -// DPRINTF(CoalesceEngine, "%s: There is room for another target " -// "for cacheBlocks[%d].\n", __func__, block_index); -// } - -// if (aligned_addr != cacheBlocks[block_index].addr) { -// stats.readMisses++; -// } else { -// stats.readHitUnderMisses++; -// } - -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " -// "cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.numVertexReads++; -// return true; -// } -// } -// } - bool CoalesceEngine::recvWLRead(Addr addr) { @@ -615,6 +444,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -632,6 +463,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); delete pkt; } @@ -639,7 +472,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); if 
(aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -662,6 +496,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // End of the said block servicedIndices.push_back(i); // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " @@ -677,15 +513,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", - __func__, print_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " + "and is removed.\n", __func__, print_addr); } if (MSHR[block_index].empty()) { MSHR.erase(block_index); - cacheBlocks[block_index].hasConflict = false; - } else { - assert(cacheBlocks[block_index].hasConflict); + // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -726,37 +560,111 @@ CoalesceEngine::processNextResponseEvent() void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - // TODO: Parameterize all the numbers here. Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", - __func__, wl.to_string(), addr); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. 
This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, wl.to_string(), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, wl.to_string(), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].items[wl_offset] = wl; + cacheBlocks[block_index].needsApply |= true; + // NOTE: We don't set needsWB and rely on processNextApplyEvent to + // set that bit. stats.numVertexWrites++; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." - " It does not have any taken items anymore.\n", - __func__, block_index); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. 
applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + 
DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { From 32385c4e91bb59ee80edc684d122af2d31f08213 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 31 Jul 2022 14:32:04 -0700 Subject: [PATCH 139/279] Completed cache block state machine. Needs rework of push interface. --- src/accl/graph/sega/coalesce_engine.cc | 205 +++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 7 +- 2 files changed, 109 insertions(+), 103 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f33a2d893..904889f12b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -198,7 +198,11 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - assert(cacheBlocks[block_index].addr != aligned_addr); + // FIXME: Kake this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. 
+ // assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { @@ -220,14 +224,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.readRejections++; - return false; - } if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].busyMask == 0) && (!cacheBlocks[block_index].pendingApply) && @@ -288,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back( [this] (int block_index) { processNextRead(block_index); @@ -323,7 +319,7 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[blokc_index].pendingData); + assert(!cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR[block_index].size() == 0); @@ -607,6 +603,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } } else { assert(MSHR.size() <= numMSHREntries); // cache line has conflict. 
@@ -666,70 +666,71 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } void CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); - if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "apply process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. 
" - "Therefore, no apply needed.\n", __func__, block_index); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", - __func__, block_index); - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - - if (new_prop != old_prop) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - int bit_index = - getBitIndexBase(cacheBlocks[block_index].addr) + i; - if ((cacheBlocks[block_index].items[i].degree != 0) && - (needsPush[bit_index] == 0)) { - // If the respective bit in the bit vector is set - // there is no need to try and resend it. + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + uint32_t current_prop = cacheBlocks[block_index].items[index].prop; + uint32_t new_prop = std::min(current_prop, + cacheBlocks[block_index].items[index].tempProp); + if (new_prop != current_prop) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, cacheBlocks[block_index].addr, index, + cacheBlocks[block_index].items[index].to_string()); + + int bit_index_base = + getBitIndexBase(cacheBlocks[block_index].addr); + if ((needsPush[bit_index_base + index] == 0) && + (cacheBlocks[block_index].items[index].degree != 0)) { if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].items[index]); } else { - needsPush[bit_index] = 1; + 
needsPush[bit_index_base + index] = 1; } } } } - } + cacheBlocks[block_index].needsWB = true; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; - // TODO: This is where eviction policy goes - if ((cacheBlocks[block_index].hasConflict) && - (cacheBlocks[block_index].busyMask == 0)) { - memoryFunctionQueue.emplace_back([this] (int block_index) { + assert(MSHR.size() < numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBack(block_index); }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " - "to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } applyQueue.pop_front(); @@ -770,6 +771,17 @@ CoalesceEngine::processNextMemoryEvent() void CoalesceEngine::processNextRead(int block_index) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + 
assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -781,54 +793,53 @@ CoalesceEngine::processNextRead(int block_index) void CoalesceEngine::processNextWriteBack(int block_index) { - // Why would we write it back if it does not have a conflict? - assert(cacheBlocks[block_index].hasConflict); - - if ((cacheBlocks[block_index].busyMask != 0) || - (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "writeback process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - // FIXME: Fix the name of this stat. - stats.falseEvictSchedules++; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on " - "cacheBlocks[%d].\n", __func__, block_index); - PacketPtr write_pkt = createWritePacket( + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + if (cacheBlocks[block_index].pendingWB) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " + DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - memPort.sendPacket(write_pkt); - } else { - DPRINTF(CoalesceEngine, "%s: No change observed on " - "cacheBlocks[%d]. No write back needed.\n", - __func__, block_index); - } - assert(!MSHR[block_index].empty()); + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + Addr miss_addr = MSHR[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for " - "cacheBlocks[%d] is Addr: %lu.\n", - __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " - "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + 
cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " - "memoryFunctionQueue.\n", __func__, block_index); + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } } @@ -866,7 +877,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].allocated))) { + (cacheBlocks[block_index].pendingData))) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e7655a069e..2ba0b62aaf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,9 +59,6 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; - - bool allocated; - bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -73,9 +70,7 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false), - allocated(false), - hasConflict(false) + pendingWB(false) { items = new WorkListItem [num_elements]; } From aa08cf516b155136cb53488cfd6ef3e699807553 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 2 Aug 2022 22:33:54 -0700 Subject: [PATCH 140/279] Fixing scheduling error of memory functions. 
--- src/accl/graph/SConscript | 32 ----- src/accl/graph/base/data_structs.hh | 2 +- src/accl/graph/sega/SConscript | 9 +- src/accl/graph/sega/coalesce_engine.cc | 176 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 24 ++-- 5 files changed, 120 insertions(+), 123 deletions(-) delete mode 100644 src/accl/graph/SConscript diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript deleted file mode 100644 index 5dffd1a396..0000000000 --- a/src/accl/graph/SConscript +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2016 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Import('*') - -DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 707b57c56f..830f1ecc16 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -47,7 +47,7 @@ struct __attribute__ ((packed)) WorkListItem std::string to_string() { return csprintf( - "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", tempProp, prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 81a29df6af..4c398b5ccd 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,10 +39,15 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') +DebugFlag('BaseMemoryEngine') +DebugFlag('BitVector') DebugFlag('CenteralController') -DebugFlag('CoalesceEngine') DebugFlag('CacheBlockState') +DebugFlag('CoalesceEngine') DebugFlag('PushEngine') +DebugFlag('SEGAStructureSize') DebugFlag('WLEngine') + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 904889f12b..da2bc54c19 
100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,8 +33,9 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/CoalesceEngine.hh" +#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -76,6 +77,13 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +DrainState +CoalesceEngine::drain() +{ + DPRINTF(CoalesceEngine, "%s: drain called.\n"); + return DrainState::Drained; +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) @@ -156,6 +164,7 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -198,7 +207,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Kake this assert work. It will break if the cache block + // FIXME: Make this assert work. It will break if the cache block // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. 
@@ -258,10 +267,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" "to be written back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextWriteBack for input " "%d to memoryFunctionQueue.\n", @@ -274,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " "not need to be written back.\n", __func__, block_index); cacheBlocks[block_index].addr = aligned_addr; @@ -285,10 +295,11 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextRead for input " "%d to memoryFunctionQueue.\n", @@ -332,17 +343,16 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - // cacheBlocks[block_index].allocated = true; - // cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, 
block_index, addr); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " "input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -415,7 +425,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { @@ -427,7 +437,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); @@ -459,6 +469,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); delete pkt; @@ -492,6 +503,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // End of the said block @@ -590,6 +602,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -600,6 +613,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((cacheBlocks[block_index].busyMask == 0)) { if (cacheBlocks[block_index].needsApply) { cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); @@ -617,10 +631,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" " back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -645,10 +660,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - 
processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -710,15 +726,18 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); assert(MSHR.size() < numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" " %d to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && @@ -750,12 +769,14 @@ CoalesceEngine::processNextMemoryEvent() DPRINTF(CoalesceEngine, "%s: Processing another " "memory function.\n", __func__); - std::function next_memory_function; + std::function next_memory_function; int next_memory_function_input; + Tick next_memory_function_tick; std::tie( next_memory_function, - next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" "memoryFunctionQueue.size = %d.\n", __func__, @@ -769,12 +790,16 @@ CoalesceEngine::processNextMemoryEvent() } void -CoalesceEngine::processNextRead(int block_index) +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // + assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -791,23 +816,25 @@ CoalesceEngine::processNextRead(int block_index) } void -CoalesceEngine::processNextWriteBack(int block_index) +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - if (cacheBlocks[block_index].pendingWB) { + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -833,13 +860,21 @@ CoalesceEngine::processNextWriteBack(int block_index) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); } } @@ -863,9 +898,14 @@ CoalesceEngine::getOptimalBitVectorSlice() // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].busyMask == 0)) { + // Idle state: valid && !pendingApply && !pendingWB + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); // current_score += numElementsPerLine * 2; // if (current_score > score) { // score = current_score; @@ -876,8 +916,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } // } return std::make_tuple(true, it); - } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].pendingData))) { + } else if (cacheBlocks[block_index].addr != addr) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; @@ -893,7 +932,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int slice_base_2) +CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -907,17 +946,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) assert(cacheBlocks[block_index].busyMask == 0); int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = 
std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; if (needsPush[slice_base + i] == 1) { peerPushEngine->recvWLItemRetry( cacheBlocks[block_index].items[i]); @@ -925,24 +958,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) push_needed += needsPush[slice_base + i]; needsPush[slice_base + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" - " input %d to memoryFunctionQueue.\n", - __func__, block_index); - } - } } else { PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -958,9 +978,10 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) } if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " "0 to memoryFunctionQueue.\n", __func__); } @@ -990,9 +1011,10 @@ CoalesceEngine::recvPushRetry() assert(numRetriesReceived == 1); // TODO: Pass slice_base to 
getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2ba0b62aaf..ce6e0daca6 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,6 +59,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; + Tick lastChangedTick; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -70,7 +71,8 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false) + pendingWB(false), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } @@ -78,10 +80,11 @@ class CoalesceEngine : public BaseMemoryEngine std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s}", addr, busyMask, - valid ? "true" : "false", needsApply ? "true" : "false", - needsWB ? "true" : "false", pendingData ? "true" : "false", - pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); } }; @@ -114,10 +117,10 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); - void processNextRead(int block_index); - void processNextWriteBack(int block_index); - void processNextPushRetry(int slice_base); - std::deque, int>> memoryFunctionQueue; + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextPushRetry(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); @@ -151,12 +154,11 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); + virtual DrainState drain() override; bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); From 0279e8aa4df8723333dc419a42aac73bad111167 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 3 Aug 2022 12:41:28 -0700 Subject: [PATCH 141/279] Fixing incorrect assert. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index da2bc54c19..21dd746aad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -728,7 +728,7 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); - assert(MSHR.size() < numMSHREntries); + assert(MSHR.size() <= numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); From 15c5d55626f6d49048b549ecb679c517adf38e1f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 5 Aug 2022 13:37:54 -0700 Subject: [PATCH 142/279] Updating memory address mapping and interface for push coalesce. --- configs/accl/sega.py | 30 ++++++++++------- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 3 +- src/accl/graph/base/data_structs.hh | 19 +++++++++++ src/accl/graph/sega/PushEngine.py | 3 +- src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++------- src/accl/graph/sega/push_engine.hh | 35 +++++++++++++++----- 7 files changed, 96 insertions(+), 36 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7577331f2b..26488ef69d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -8,20 +8,23 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=2, + self.push_engine = PushEngine(base_edge_addr=0, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + resp_queue_size=64) + # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, + # push_req_queue_size=32, + # attached_memory_atom_size=64, + # 
resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="32B", - num_mshr_entry=1, - num_tgts_per_mshr=1) + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + register_file_size=32) def getRespPort(self): return self.wl_engine.resp_port @@ -74,10 +77,15 @@ def __init__(self, latency="30ns") ) edge_mem_ctrl.append( - SimpleMemory(range=self._edge_ranges[i], + # SimpleMemory(range=self._edge_ranges[i], + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}") + SimpleMemory(range=AddrRange(self._edge_chunk_size), bandwidth="4.8GB/s", latency="30ns", - image_file=f"{graph_path}/edgelist_{i}") + image_file=f"{graph_path}/edgelist_{i}", + in_addr_map=False) ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 38a8662ed0..ade95800d2 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -31,7 +31,7 @@ namespace gem5 { -BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): ClockedObject(params), system(params.system), _requestorId(system->getRequestorId(this)) diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index c8c9784ed1..268bb60b76 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -47,8 +47,7 @@ class BaseReduceEngine : public ClockedObject public: PARAMS(BaseReduceEngine); - - BaseReduceEngine(const BaseReduceEngineParams ¶ms); + BaseReduceEngine(const Params ¶ms); ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } diff --git 
a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 830f1ecc16..6f775d8a38 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -78,15 +78,34 @@ struct __attribute__ ((packed)) Edge return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); } + Edge(): weight(0), neighbor(0) {} + Edge(uint16_t weight, uint64_t neighbor): weight(weight), neighbor(neighbor) {} + }; static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); +struct CompleteEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): + src(src), dst(dst), weight(weight) + {} + + std::string to_string() + { + return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + src, dst, weight); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 447731219e..a45f5d6ead 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,8 +35,7 @@ class PushEngine(BaseMemoryEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("The base address for the " - "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d87462d7dd..d071e8fd37 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -38,7 +38,6 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), onTheFlyMemReqs(0), @@ -140,12 +139,12 @@ PushEngine::recvWLItem(WorkListItem wl) "checking if there is 
enough push space. Use allocatePushSpace.\n"); DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -162,12 +161,12 @@ PushEngine::recvWLItemRetry(WorkListItem wl) DPRINTF(PushEngine, "%s: Received %s with retry.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -191,22 +190,24 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); + PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs++; if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); pushReqQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", @@ -228,9 +229,6 @@ PushEngine::processNextMemoryReadEvent() } } - // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } if (!pushReqQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); @@ -265,6 +263,20 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs--; assert(memRespQueue.size() <= memRespQueueSize); + uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + std::vector edges; + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + edges.emplace_back(push_info.src, edge_dst, edge_weight); + } + edgeQueue.push_back(edges); + delete pkt_data; + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } @@ -288,6 +300,12 @@ PushEngine::processNextPushEvent() Edge* curr_edge = (Edge*) (data + offset); + std::vector& current_edges = edgeQueue.front(); + while(!current_edges.empty()) { + CompleteEdge curr_edge = current_edges.back(); + 
DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); + current_edges.pop_back(); + } // TODO: Implement propagate function here uint32_t update_value = value + 1; PacketPtr update = createUpdatePacket( diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9b182e2251..7fb6c42579 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,19 +42,21 @@ class CoalesceEngine; class PushEngine : public BaseMemoryEngine { private: - class PushPacketInfoGen { + class EdgeReadInfoGen { private: Addr _start; Addr _end; size_t _step; size_t _atom; + uint32_t _value; + Addr _src; public: - PushPacketInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _value(value) + EdgeReadInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value, Addr src): + _start(start), _end(end), _step(step), + _atom(atom), _value(value), _src(src) {} std::tuple nextReadPacketInfo() @@ -74,8 +76,17 @@ class PushEngine : public BaseMemoryEngine return std::make_tuple(aligned_addr, offset, num_items); } - uint32_t value() { return _value; } bool done() { return (_start >= _end); } + + Addr src() { return _src; } + uint32_t value() { return _value; } + }; + + struct PushInfo { + Addr src; + uint32_t value; + Addr offset; + int numElements; }; class ReqPort : public RequestPort @@ -98,26 +109,27 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; - Addr baseEdgeAddr; - int pushReqQueueSize; int numTotalRetries; int numPendingRetries; - std::deque pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; + std::unordered_map reqInfoMap; int onTheFlyMemReqs; int memRespQueueSize; 
std::deque memRespQueue; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); @@ -167,6 +179,11 @@ class PushEngine : public BaseMemoryEngine int getNumRetries() { return numTotalRetries; } + void start(); // CoalesceEngine announcing work + void stop(); // CoalesceEngine announcing no work + bool running() { return _running; } + void recvWLItem2(Addr addr, WorkListItem wl); + }; } From ef1606cc3c6f9bf06acea6fd0169504ab350e91b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 12 Aug 2022 08:32:42 -0700 Subject: [PATCH 143/279] Implemented pullVertex. --- configs/accl/sega.py | 7 +- src/accl/graph/base/data_structs.hh | 5 +- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 71 +++--- src/accl/graph/sega/coalesce_engine.hh | 6 +- src/accl/graph/sega/push_engine.cc | 257 +++++++++------------- src/accl/graph/sega/push_engine.hh | 52 ++--- 8 files changed, 167 insertions(+), 240 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 26488ef69d..e7a704d477 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -6,10 +6,9 @@ from m5.util.convert import toMemorySize class MPU(SubSystem): - def __init__(self, base_edge_addr): + def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0, - push_req_queue_size=32, + self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, @@ -151,7 +150,7 @@ def __init__(self, mpus = [] for i in range(num_mpus): - mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus.append(MPU()) mpus[i].setReqPort(self.interconnect.cpu_side_ports) mpus[i].setRespPort(self.interconnect.mem_side_ports) mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
6f775d8a38..026a3cb7b2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -94,9 +94,10 @@ struct CompleteEdge { uint64_t src; uint64_t dst; uint32_t weight; + uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): - src(src), dst(dst), weight(weight) + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 4c398b5ccd..ae216ccdd4 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -47,6 +47,7 @@ DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') +DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index a5d1d7e8e7..9bd1941b23 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -99,11 +99,9 @@ BaseMemoryEngine::MemPort::recvReqRetry() "Received retry without a blockedPacket"); _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); } PacketPtr diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21dd746aad..dcec2a5f78 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,8 +47,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), + 
numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -423,26 +424,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu. It was not found in the cache.\n", __func__, addr); WorkListItem* items = pkt->getPtr(); - int push_needed = 0; // No applying of the line needed. DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); + _workCount--; + needsPush[it + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, items[i]); + break; } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // } delete pkt; return true; } @@ -691,7 +686,7 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, cacheBlocks[block_index].to_string()); + __func__, block_index, cacheBlocks[block_index].to_string()); assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); @@ -712,14 +707,15 @@ CoalesceEngine::processNextApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if ((needsPush[bit_index_base + index] == 0) && - (cacheBlocks[block_index].items[index].degree != 0)) { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[index]); - } else { + + if (cacheBlocks[block_index].items[index].degree > 0) { + if (needsPush[bit_index_base + index] == 0) { + _workCount++; needsPush[bit_index_base + index] = 1; } + if (!peerPushEngine->running()) { + peerPushEngine->start(); + } } } } @@ -945,24 +941,20 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - int push_needed = 0; DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[slice_base + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); + _workCount--; + needsPush[slice_base + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, + cacheBlocks[block_index].items[i]); + break; } - push_needed += needsPush[slice_base + i]; - needsPush[slice_base + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); } else { PacketPtr pkt 
= createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -973,11 +965,10 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) // a flag to true (maybe not even needed just look if the cache has a // line allocated for it in the cacheBlocks). } - numRetriesReceived--; - assert(numRetriesReceived == 0); + numPullsReceived--; } - if (numRetriesReceived > 0) { + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); @@ -1002,29 +993,19 @@ CoalesceEngine::recvMemRetry() } void -CoalesceEngine::recvPushRetry() +CoalesceEngine::recvVertexPull() { - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(numRetriesReceived == 1); - - // TODO: Pass slice_base to getOptimalBitVectorSlice + numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " - "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } } - - CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce6e0daca6..6969fe2823 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,8 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map> MSHR; std::deque> responseQueue; - int numRetriesReceived; + int _workCount; + int numPullsReceived; UniqueFIFO 
applyQueue; std::bitset needsPush; @@ -161,7 +162,8 @@ class CoalesceEngine : public BaseMemoryEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void recvPushRetry(); + int workCount() { return _workCount; } + void recvVertexPull(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d071e8fd37..b5341b3d61 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/PushEngine.hh" +#include "debug/TempFlag.hh" #include "mem/packet_access.hh" namespace gem5 @@ -38,13 +39,12 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - pushReqQueueSize(params.push_req_queue_size), - numTotalRetries(0), numPendingRetries(0), - onTheFlyMemReqs(0), - memRespQueueSize(params.resp_queue_size), + _running(false), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -66,15 +66,31 @@ PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, numElementsPerLine = elements_per_line; } +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); + if (nextPushEvent.pending()) { + nextPushEvent.wake(); + schedule(nextPushEvent, nextCycle()); + } +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
+ DPRINTF(PushEngine, "%s: Sending pakcet: %s to " + "the network.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; + DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvReqRetry(); } } @@ -92,86 +108,73 @@ PushEngine::ReqPort::recvReqRetry() DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } void -PushEngine::deallocatePushSpace(int space) +PushEngine::start() { - /// DISCUSS: Might have to check whether the addrGenEvent is scheduled - // and or the pushReqQueue is empty. If so we might need to - // send retries. 
- DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", - __func__, space); - numPendingRetries--; - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " - "free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); - } + assert(!_running); + assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // NOTE: We might have to check for size availability here. + assert(workLeft()); + if (vertexSpace()) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItem(WorkListItem wl) +PushEngine::processNextVertexPullEvent() { - assert(wl.degree != 0); - - assert((pushReqQueueSize == 0) || - (pushReqQueue.size() < pushReqQueueSize)); - panic_if((pushReqQueue.size() == pushReqQueueSize) && - (pushReqQueueSize != 0), "You should call this method after " - "checking if there is enough push space. 
Use allocatePushSpace.\n"); + // TODO: change edgePointerQueueSize + numPendingPulls++; + peerCoalesceEngine->recvVertexPull(); - DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + if (!workLeft()) { + _running = false; + } - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItemRetry(WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, WorkListItem wl) { - assert(wl.degree != 0); - DPRINTF(PushEngine, "%s: Received %s with retry.\n", - __func__, wl.to_string()); + assert(wl.degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - assert(pushReqQueue.size() <= pushReqQueueSize); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + numPendingPulls--; + DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", + __func__, addr, wl.to_string()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } - numTotalRetries--; if ((!nextMemoryReadEvent.pending()) && 
(!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); @@ -186,20 +189,17 @@ PushEngine::processNextMemoryReadEvent() return; } - if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; @@ -208,42 +208,23 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + edgePointerQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } } - if (!pushReqQueue.empty()) { + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); schedule(nextMemoryReadEvent, nextCycle()); } } -void -PushEngine::processNextSendRetryEvent() -{ - assert(numPendingRetries == 0); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); -} - void PushEngine::recvMemRetry() { @@ -259,25 +240,27 @@ PushEngine::handleMemResp(PacketPtr pkt) { // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - memRespQueue.push_back(pkt); - onTheFlyMemReqs--; - assert(memRespQueue.size() <= memRespQueueSize); uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::vector edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, edge_weight); + edges.emplace_back(push_info.src, edge_dst, + edge_weight, push_info.value); } edgeQueue.push_back(edges); + onTheFlyMemReqs--; + reqInfoMap.erase(pkt->req); delete pkt_data; + delete pkt; - if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + if ((!nextPushEvent.pending()) && + (!nextPushEvent.scheduled())) { schedule(nextPushEvent, nextCycle()); } return true; @@ -287,50 +270,37 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - Addr offset = reqOffsetMap[pkt->req]; - assert(offset < peerMemoryAtomSize); - uint32_t value = reqValueMap[pkt->req]; + if 
(reqPort.blocked()) { + nextPushEvent.sleep(); + return; + } - DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " - "offset: %lu\n", - __func__, pkt->getAddr(), offset); + std::deque& edge_list = edgeQueue.front(); + CompleteEdge curr_edge = edge_list.front(); - Edge* curr_edge = (Edge*) (data + offset); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - std::vector& current_edges = edgeQueue.front(); - while(!current_edges.empty()) { - CompleteEdge curr_edge = current_edges.back(); - DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); - current_edges.pop_back(); - } // TODO: Implement propagate function here - uint32_t update_value = value + 1; + uint32_t update_value = curr_edge.value + 1; PacketPtr update = createUpdatePacket( - curr_edge->neighbor, update_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(update); - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); - reqNumEdgeMap[pkt->req]--; - assert(reqNumEdgeMap[pkt->req] >= 0); - } + curr_edge.dst, update_value); + + reqPort.sendPacket(update); + stats.numUpdates++; + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " + "with value: %d.\n", __func__, curr_edge.src, + curr_edge.dst, update_value); + - if (reqNumEdgeMap[pkt->req] == 0) { - reqOffsetMap.erase(pkt->req); - reqNumEdgeMap.erase(pkt->req); - reqValueMap.erase(pkt->req); - memRespQueue.pop_front(); - delete pkt; + edge_list.pop_front(); + if (edge_list.empty()) { + edgeQueue.pop_front(); } - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + assert(!nextPushEvent.pending()); + assert(!nextPushEvent.scheduled()); + if (!edgeQueue.empty()) { schedule(nextPushEvent, nextCycle()); } } @@ -354,17 +324,6 @@ 
PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -bool -PushEngine::allocatePushSpace() { - if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { - return true; - } else { - numTotalRetries++; - return false; - } -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7fb6c42579..c79b0de944 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -49,14 +49,14 @@ class PushEngine : public BaseMemoryEngine size_t _step; size_t _atom; - uint32_t _value; Addr _src; + uint32_t _value; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value, Addr src): + size_t atom, Addr src, uint32_t value): _start(start), _end(end), _step(step), - _atom(atom), _value(value), _src(src) + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -109,38 +109,34 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + ReqPort reqPort; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; - ReqPort reqPort; - - int pushReqQueueSize; - int numTotalRetries; - int numPendingRetries; - std::deque pushReqQueue; - - // TODO: Add size one size for all these maps - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + int numPendingPulls; + int edgePointerQueueSize; + std::deque edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; - int memRespQueueSize; - std::deque memRespQueue; - std::deque> edgeQueue; + int edgeQueueSize; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - EventFunctionWrapper nextPushEvent; + 
MemoryEvent nextPushEvent; void processNextPushEvent(); - EventFunctionWrapper nextSendRetryEvent; - void processNextSendRetryEvent(); + bool vertexSpace(); + bool workLeft(); struct PushStats : public statistics::Group { @@ -166,24 +162,14 @@ class PushEngine : public BaseMemoryEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace(); - - void deallocatePushSpace(int space); - - void recvWLItem(WorkListItem wl); - - void recvWLItemRetry(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); - int getNumRetries() { return numTotalRetries; } + void recvReqRetry(); - void start(); // CoalesceEngine announcing work - void stop(); // CoalesceEngine announcing no work + void start(); bool running() { return _running; } - void recvWLItem2(Addr addr, WorkListItem wl); - + void recvVertexPush(Addr addr, WorkListItem wl); }; } From b734fb89aea9aace4afe97936e55743568f63e2e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 22 Aug 2022 11:51:06 -0700 Subject: [PATCH 144/279] Added sim exit functionality. 
WIP --- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 7 +++++++ src/accl/graph/sega/coalesce_engine.hh | 2 ++ src/accl/graph/sega/push_engine.cc | 11 +++++++++++ src/accl/graph/sega/push_engine.hh | 8 +++++--- src/accl/graph/sega/wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.hh | 3 ++- 7 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 102800de92..1f325703bd 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -52,7 +52,7 @@ class CenteralController : public ClockedObject RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} - // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dcec2a5f78..57bc99013c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,6 +85,13 @@ CoalesceEngine::drain() return DrainState::Drained; } +bool +CoalesceEngine::done() +{ + return needsPush.none() && + memoryFunctionQueue.empty() && peerWLEngine->done(); +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6969fe2823..b19a1bc461 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -164,6 +164,8 @@ class CoalesceEngine : public BaseMemoryEngine int workCount() { return _workCount; } void recvVertexPull(); + + bool done(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index b5341b3d61..9866c30f5c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -32,6 +32,7 @@ 
#include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -126,6 +127,12 @@ PushEngine::workLeft() return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } +bool +PushEngine::done() +{ + return edgeQueue.empty() && + edgePointerQueue.empty() && peerCoalesceEngine->done(); +} void PushEngine::start() { @@ -298,6 +305,10 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } + if (done()) { + exitSimLoopNow(name() + " is done."); + } + assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c79b0de944..a42228f4c0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -126,6 +126,9 @@ class PushEngine : public BaseMemoryEngine template PacketPtr createUpdatePacket(Addr addr, T value); + bool vertexSpace(); + bool workLeft(); + EventFunctionWrapper nextVertexPullEvent; void processNextVertexPullEvent(); @@ -135,9 +138,6 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextPushEvent; void processNextPushEvent(); - bool vertexSpace(); - bool workLeft(); - struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -170,6 +170,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + + bool done(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 12f4548aa2..e999667ad1 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,12 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + // TODO: Parameterize the number of pops WLEngine can do at a time. 
// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5e8e5b25f3..1360d37132 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -80,7 +80,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; EventFunctionWrapper nextReadEvent; @@ -116,6 +115,8 @@ class WLEngine : public BaseReduceEngine void handleIncomingWL(Addr addr, WorkListItem wl); int getRegisterFileSize() { return registerFileSize; } + + bool done(); }; } From 3ab86663cf3cd540c5c6964a00ccaf61a09b2228 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 26 Aug 2022 09:54:35 -0700 Subject: [PATCH 145/279] Adding a DDR model to the accelerator --- configs/accl/sega.py | 45 +++++++++++++++++++++++++++++------------- src/base/statistics.hh | 2 +- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e7a704d477..28f9211045 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,6 +2,7 @@ import argparse from math import log +import math from m5.objects import * from m5.util.convert import toMemorySize @@ -18,7 +19,7 @@ def __init__(self): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="8MiB", + cache_size="16MiB", num_mshr_entry=32, num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, @@ -61,7 +62,7 @@ def __init__(self, self._edge_chunk_size = int(\ toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange(\ + self._edge_ranges = [AddrRange( start=toMemorySize(vertex_memory_size)+\ self._edge_chunk_size*i,\ size=self._edge_chunk_size)\ @@ -69,23 +70,39 @@ def __init__(self, vertex_mem_ctrl = [] edge_mem_ctrl = [] + # vertex_mem_ranges = self._vertex_ranges + + for i in 
range(num_channels): + # vertex_addr_range = vertex_mem_ranges[i] + vertex_interface = DDR4_2400_8x8() + vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface vertex_mem_ctrl.append( - SimpleMemory(range=self._vertex_ranges[i], - bandwidth="19.2GB/s", - latency="30ns") + ctrl ) + + edge_interface = DDR4_2400_8x8( + image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) + edge_interface.range = AddrRange(self._edge_chunk_size) + # start=toMemorySize(vertex_memory_size)+\ + # self._edge_chunk_size*i,\ + # size=self._edge_chunk_size) + # edge_addr_range = edge_mem_range[0] + # edge_interface.range = self._edge_chunk_size + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface edge_mem_ctrl.append( - # SimpleMemory(range=self._edge_ranges[i], - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}") - SimpleMemory(range=AddrRange(self._edge_chunk_size), - bandwidth="4.8GB/s", - latency="30ns", - image_file=f"{graph_path}/edgelist_{i}", - in_addr_map=False) + edge_ctrl ) + # edge_mem_ctrl.append( + # SimpleMemory(range=AddrRange(self._edge_chunk_size), + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}", + # in_addr_map=False) + # ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 8156be5a79..22be74ec90 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1051,7 +1051,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; From 6ecdecb43cf6b8911da5b301ae7ef4e9ed84f366 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 28 Aug 2022 21:14:54 -0700 Subject: [PATCH 146/279] Completed sim exit. I think... 
--- configs/accl/sega.py | 184 ++++++------------ src/accl/graph/sega/CenteralController.py | 6 +- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/MPU.py | 47 +++++ src/accl/graph/sega/PushEngine.py | 2 - src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 23 ++- src/accl/graph/sega/centeral_controller.hh | 13 +- src/accl/graph/sega/coalesce_engine.cc | 78 ++++---- src/accl/graph/sega/coalesce_engine.hh | 11 +- src/accl/graph/sega/mpu.cc | 206 +++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 135 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 73 ++------ src/accl/graph/sega/push_engine.hh | 38 +--- src/accl/graph/sega/wl_engine.cc | 133 ++++--------- src/accl/graph/sega/wl_engine.hh | 43 +---- 17 files changed, 573 insertions(+), 427 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py create mode 100644 src/accl/graph/sega/mpu.cc create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 28f9211045..a0bfb5ddce 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,112 +4,8 @@ from math import log import math from m5.objects import * -from m5.util.convert import toMemorySize -class MPU(SubSystem): - def __init__(self): - super(MPU, self).__init__() - self.push_engine = PushEngine(push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64) - # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - # push_req_queue_size=32, - # attached_memory_atom_size=64, - # resp_queue_size=64) - self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine, - attached_memory_atom_size=32, - cache_size="16MiB", - num_mshr_entry=32, - num_tgts_per_mshr=16) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - register_file_size=32) - - def getRespPort(self): - return self.wl_engine.resp_port - def setRespPort(self, port): - self.wl_engine.resp_port = 
port - - def getReqPort(self): - return self.push_engine.req_port - def setReqPort(self, port): - self.push_engine.req_port = port - - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - -class MPUMemory(SubSystem): - def __init__(self, - num_channels: int, - cache_line_size: int, - vertex_memory_size: str, - edge_memory_size: str, - graph_path: str): - super(MPUMemory, self).__init__() - - self._vertex_ranges = self._interleave_addresses( - AddrRange(start=0, size=vertex_memory_size),\ - num_channels,\ - cache_line_size) - - self._edge_chunk_size = int(\ - toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange( - start=toMemorySize(vertex_memory_size)+\ - self._edge_chunk_size*i,\ - size=self._edge_chunk_size)\ - for i in range(num_channels)] - - vertex_mem_ctrl = [] - edge_mem_ctrl = [] - # vertex_mem_ranges = self._vertex_ranges - - - for i in range(num_channels): - # vertex_addr_range = vertex_mem_ranges[i] - vertex_interface = DDR4_2400_8x8() - vertex_interface.range = self._vertex_ranges[i] - ctrl = MemCtrl() - ctrl.dram = vertex_interface - vertex_mem_ctrl.append( - ctrl - ) - - edge_interface = DDR4_2400_8x8( - image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) - edge_interface.range = AddrRange(self._edge_chunk_size) - # start=toMemorySize(vertex_memory_size)+\ - # self._edge_chunk_size*i,\ - # size=self._edge_chunk_size) - # edge_addr_range = edge_mem_range[0] - # edge_interface.range = self._edge_chunk_size - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - edge_mem_ctrl.append( - edge_ctrl - ) - # edge_mem_ctrl.append( - # SimpleMemory(range=AddrRange(self._edge_chunk_size), - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}", - # in_addr_map=False) - # 
) - self.vertex_mem_ctrl = vertex_mem_ctrl - self.edge_mem_ctrl = edge_mem_ctrl - - def _interleave_addresses(self, - plain_range, - num_channels, - cache_line_size): +def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] @@ -123,17 +19,48 @@ def _interleave_addresses(self, intlvMatch=i)) return ret - def getVertexPort(self, i): - return self.vertex_mem_ctrl[i].port - def setVertexPort(self, port, i): - self.vertex_mem_ctrl[i].port = port +class GPT(SubSystem): + def __init__(self, edge_memory_size: str): + super().__init__() + self.wl_engine = WLEngine(update_queue_size=64, + register_file_size=32) + self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) + self.push_engine = PushEngine(push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64) + self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s") + self.edge_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU(wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine) - def getEdgeBaseAddr(self, i): - return self._edge_ranges[i].start - def getEdgePort(self, i): - return self.edge_mem_ctrl[i].port - def setEdgePort(self, port, i): - self.edge_mem_ctrl[i].port = port + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + 
self.edge_mem_ctrl.image_file = edge_image class SEGA(System): def __init__(self, @@ -158,21 +85,19 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory( - num_mpus, - self.cache_line_size, - "2GiB", - "14GiB", - graph_path) + vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) - mpus = [] + gpts = [] for i in range(num_mpus): - mpus.append(MPU()) - mpus[i].setReqPort(self.interconnect.cpu_side_ports) - mpus[i].setRespPort(self.interconnect.mem_side_ports) - mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) - mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) - self.mpu = mpus + gpt = GPT("8GiB") + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] def get_inputs(): argparser = argparse.ArgumentParser() @@ -197,5 +122,4 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print(f"Exited simulation because {exit_event.getCause()}") - exit() + print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bd2f6320a8..6f6b12ea2c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -36,7 +36,9 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") - addr = Param.Addr("") - value = Param.Int(0, "") + mpu_vector = VectorParam.MPU("All mpus in the system.") + + addr = Param.Addr("The addr for the initial update") + value = Param.Int("The value for the initial update") image_file = Param.String("Path 
to the global memory image.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 06c6f92750..14902ef352 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -34,9 +34,6 @@ class CoalesceEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..2d65be2949 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + in_port = ResponsePort("Port to receive updates from outside") + out_port = RequestPort("Port to send updates to the outside") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index a45f5d6ead..f98f22ba9d 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - req_port = RequestPort("Port to send updates to the outside") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index ae216ccdd4..42a8d84ad5 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -30,12 +30,14 @@ Import('*') 
SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') +SimObject("MPU.py") SimObject('PushEngine.py') SimObject('WLEngine.py') Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') +Source("mpu.cc") Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 98089328f4..52ca031260 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,9 +34,6 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " - "this WLEngine is connected to.") update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") register_file_size = Param.Int("Number of internal registers the " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index f19c93ebac..5ce7228abb 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,10 +28,13 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -43,7 +46,12 @@ CenteralController::CenteralController reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) -{} +{ + for (auto mpu : params.mpu_vector) { + mpuVector.push_back(mpu); + mpu->registerCenteralController(this); + } +} Port& CenteralController::getPort(const std::string &if_name, PortID idx) @@ -143,4 +151,17 @@ CenteralController::functionalAccess(PacketPtr pkt) reqPort.sendFunctional(pkt); } +void +CenteralController::recvDoneSignal() +{ + bool done = true; + for 
(auto mpu : mpuVector) { + done &= mpu->done(); + } + + if (done) { + exitSimLoopNow("no update left to process."); + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 1f325703bd..c54c4c04ef 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -29,7 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ #define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#include + #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/mpu.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,20 +70,20 @@ class CenteralController : public ClockedObject Addr addr; uint32_t value; + std::vector mpuVector; template PacketPtr createUpdatePacket(Addr addr, T value); - - virtual void initState(); - virtual void startup(); - void functionalAccess(PacketPtr pkt); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + virtual void initState(); + virtual void startup(); + + void recvDoneSignal(); }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 57bc99013c..d791926fe1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,7 +30,7 @@ #include -#include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/BitVector.hh" @@ -38,16 +38,16 @@ #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), - peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) 
(peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { @@ -66,30 +66,20 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - - peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsPush.reset(); } void -CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +CoalesceEngine::registerMPU(MPU* mpu) { - peerWLEngine = wl_engine; -} - -DrainState -CoalesceEngine::drain() -{ - DPRINTF(CoalesceEngine, "%s: drain called.\n"); - return DrainState::Drained; + owner = mpu; } bool CoalesceEngine::done() { - return needsPush.none() && - memoryFunctionQueue.empty() && peerWLEngine->done(); + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } // addr should be aligned to peerMemoryAtomSize @@ -153,17 +143,15 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -418,6 +406,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); @@ -439,7 +428,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (needsPush[it + i] == 1) { _workCount--; needsPush[it + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, items[i]); + owner->recvVertexPush(vertex_addr, items[i]); break; } } @@ -492,17 +481,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, miss_addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -548,18 +535,18 @@ CoalesceEngine::processNextResponseEvent() WorkListItem worklist_response; std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); + owner->handleIncomingWL(addr_response, worklist_response); DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -720,8 +707,8 @@ CoalesceEngine::processNextApplyEvent() _workCount++; needsPush[bit_index_base + index] = 1; } - if (!peerPushEngine->running()) { - peerPushEngine->start(); + if (!owner->running()) { + owner->start(); } } } @@ -760,6 +747,10 @@ CoalesceEngine::processNextApplyEvent() (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -816,6 +807,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; } void @@ -845,6 +837,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + // onTheFlyReqs++; 
cacheBlocks[block_index].needsWB = false; cacheBlocks[block_index].pendingWB = false; @@ -955,7 +948,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (needsPush[slice_base + i] == 1) { _workCount--; needsPush[slice_base + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, + owner->recvVertexPush(vertex_addr, cacheBlocks[block_index].items[i]); break; } @@ -967,6 +960,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); + onTheFlyReqs++; // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. It can simply set // a flag to true (maybe not even needed just look if the cache has a diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b19a1bc461..03b463e570 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -33,7 +33,6 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/push_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -43,7 +42,7 @@ namespace gem5 { -class WLEngine; +class MPU; class CoalesceEngine : public BaseMemoryEngine { @@ -93,14 +92,13 @@ class CoalesceEngine : public BaseMemoryEngine bool isRetry; SenderState(bool is_retry): isRetry(is_retry) {} }; - - WLEngine* peerWLEngine; - PushEngine* peerPushEngine; + MPU* owner; int numLines; int numElementsPerLine; Block* cacheBlocks; + int onTheFlyReqs; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; @@ -156,11 +154,10 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); CoalesceEngine(const Params ¶ms); - virtual DrainState drain() override; + void registerMPU(MPU* mpu); bool recvWLRead(Addr addr); void 
recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..7b1727587a --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine), + inPort(name() + ".inPort", this), + outPort(name() + ".outPort", this) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +Port& +MPU::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_port") { + return inPort; + } else if (if_name == "out_port") { + return outPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +MPU::init() +{ + localAddrRange = getAddrRanges(); + inPort.sendRangeChange(); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +AddrRangeList +MPU::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +MPU::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +MPU::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +MPU::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } else { + owner->recvReqRetry(); + } +} + +bool +MPU::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvVertexPush(Addr addr, WorkListItem wl) +{ + pushEngine->recvVertexPush(addr, wl); +} + +void +MPU::sendPacket(PacketPtr pkt) +{ + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(pkt->getAddr()); + } + + if (found_locally) { + // TODO: count number of local updates + + } else { + // TOOD: count number of remote updates + + } + + outPort.sendPacket(pkt); +} + +void +MPU::recvDoneSignal() +{ + centeralController->recvDoneSignal(); +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..edf0350caf --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + class RespPort : public ResponsePort + { + private: + MPU* owner; + bool needSendRetryReq; + + public: + RespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + {} + virtual AddrRangeList getAddrRanges() const; + + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class ReqPort : public RequestPort + { + private: + MPU* owner; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, MPU* owner) : + RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + RespPort inPort; + ReqPort outPort; + + AddrRangeList localAddrRange; + + public: + PARAMS(MPU); + MPU(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerCenteralController(CenteralController* centeral_controller); + + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + void recvFunctional(PacketPtr pkt) { 
coalesceEngine->recvFunctional(pkt); } + + bool handleIncomingUpdate(PacketPtr pkt); + void checkRetryReq() { inPort.checkRetryReq(); } + void handleIncomingWL(Addr addr, WorkListItem wl); + bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, WorkListItem wl); + + bool blocked() { return outPort.blocked(); } + void sendPacket(PacketPtr pkt); + void recvReqRetry() { pushEngine->recvReqRetry(); } + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9866c30f5c..0134133cfa 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" @@ -37,9 +38,8 @@ namespace gem5 { -PushEngine::PushEngine(const Params ¶ms): +PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), - reqPort(name() + ".req_port", this), _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), @@ -49,22 +49,10 @@ PushEngine::PushEngine(const Params ¶ms): stats(*this) {} -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BaseMemoryEngine::getPort(if_name, idx); - } -} - void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line) 
+PushEngine::registerMPU(MPU* mpu) { - peerCoalesceEngine = coalesce_engine; - numElementsPerLine = elements_per_line; + owner = mpu; } void @@ -77,43 +65,6 @@ PushEngine::recvReqRetry() } } -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - DPRINTF(PushEngine, "%s: Sending pakcet: %s to " - "the network.\n", __func__, pkt->print()); - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); - } else { - DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); - owner->recvReqRetry(); - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); - - _blocked = false; - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); -} - bool PushEngine::vertexSpace() { @@ -124,15 +75,17 @@ PushEngine::vertexSpace() bool PushEngine::workLeft() { - return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); + return ((owner->workCount() - numPendingPulls) > 0); } bool PushEngine::done() { return edgeQueue.empty() && - edgePointerQueue.empty() && peerCoalesceEngine->done(); + (onTheFlyMemReqs == 0) && + edgePointerQueue.empty(); } + void PushEngine::start() { @@ -152,7 +105,7 @@ PushEngine::processNextVertexPullEvent() { // TODO: change edgePointerQueueSize numPendingPulls++; - peerCoalesceEngine->recvVertexPull(); + owner->recvVertexPull(); if (!workLeft()) { _running = false; @@ -277,7 +230,7 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - if (reqPort.blocked()) { + if (owner->blocked()) { nextPushEvent.sleep(); return; } @@ 
-293,7 +246,7 @@ PushEngine::processNextPushEvent() PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); - reqPort.sendPacket(update); + owner->sendPacket(update); stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, @@ -305,10 +258,6 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } - if (done()) { - exitSimLoopNow(name() + " is done."); - } - assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a42228f4c0..6f92b62be0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -38,6 +38,7 @@ namespace gem5 { class CoalesceEngine; +class MPU; class PushEngine : public BaseMemoryEngine { @@ -89,31 +90,9 @@ class PushEngine : public BaseMemoryEngine int numElements; }; - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - bool _running; int numElementsPerLine; - CoalesceEngine* peerCoalesceEngine; + MPU* owner; int numPendingPulls; int edgePointerQueueSize; @@ -157,20 +136,15 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); - PushEngine(const Params ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line); - - void recvReqRetry(); + PushEngine(const Params& params); + void registerMPU(MPU* mpu); void start(); bool running() { return 
_running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvReqRetry(); + bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e999667ad1..9890eeed76 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,103 +28,61 @@ #include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), - respPort(name() + ".resp_port", this), - coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{ - coalesceEngine->registerWLEngine(this); -} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseReduceEngine::getPort(if_name, idx); - } -} +{} void -WLEngine::init() +WLEngine::registerMPU(MPU* mpu) { - respPort.sendRangeChange(); + owner = mpu; } -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -WLEngine::RespPort::checkRetryReq() +bool +WLEngine::done() { - if (needSendRetryReq) { - DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); - sendRetryReq(); - needSendRetryReq = false; - } + return registerFile.empty() && updateQueue.empty(); } bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } - return 
true; -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - coalesceEngine->recvFunctional(pkt); -} + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); -AddrRangeList -WLEngine::getAddrRanges() const -{ - return coalesceEngine->getAddrRanges(); -} + // delete the packet since it's not needed anymore. + delete pkt; -bool -WLEngine::done() -{ - return registerFile.empty() && updateQueue.empty(); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; } // TODO: Parameterize the number of pops WLEngine can do at a time. @@ -150,7 +108,7 @@ WLEngine::processNextReadEvent() // return a boolean value. It should return an integer/enum // to tell WLEngine why it rejected the read request. Their might // be things that WLEngine can do to fix head of the line blocking. - if (coalesceEngine->recvWLRead(update_addr)) { + if (owner->recvWLRead(update_addr)) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -171,7 +129,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } } } else { @@ -194,7 +152,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { @@ -238,7 +196,7 @@ WLEngine::processNextReduceEvent() __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, workListFile[addr]); + owner->recvWLWrite(addr, workListFile[addr]); registerFile.erase(addr); DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " "registerFile.size = %d, registerFileSize = %d\n", @@ -248,40 +206,15 @@ WLEngine::processNextReduceEvent() __func__, addr, registerFile.size(), registerFileSize); } workListFile.clear(); -} -bool -WLEngine::handleIncomingUpdate(PacketPtr pkt) -{ - assert(updateQueue.size() <= updateQueueSize); - if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { - return false; + if (done()) { + owner->recvDoneSignal(); } - - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - - - // delete the packet since it's not needed anymore. 
- delete pkt; - - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - return true; } WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) : statistics::Group(&_wl), wl(_wl), - ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1360d37132..4a0489b123 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,42 +34,18 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 { +class MPU; + class WLEngine : public BaseReduceEngine { private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - bool needSendRetryReq; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) - {} - virtual AddrRangeList getAddrRanges() const; - - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - virtual void init(); - - RespPort respPort; - - CoalesceEngine* coalesceEngine; + MPU* owner; int updateQueueSize; std::deque> updateQueue; @@ -79,9 +55,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; - void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); @@ -104,18 +77,12 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); - - WLEngine(const WLEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; + WLEngine(const Params& params); + void 
registerMPU(MPU* mpu); bool handleIncomingUpdate(PacketPtr pkt); - void handleIncomingWL(Addr addr, WorkListItem wl); - int getRegisterFileSize() { return registerFileSize; } - bool done(); }; From 3bb95d0cf0a32c50d06073e1f8a681413cdc5ba8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:24:27 -0700 Subject: [PATCH 147/279] Minor improvements in the code. --- src/accl/graph/sega/coalesce_engine.cc | 60 ++++++++------------------ src/accl/graph/sega/coalesce_engine.hh | 7 ++- 2 files changed, 22 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d791926fe1..ba7878be7a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -140,8 +140,9 @@ CoalesceEngine::recvWLRead(Addr addr) // TODO: Add a hit latency as a param for this object. // Can't just schedule the nextResponseEvent for latency cycles in // the future. - responseQueue.push_back(std::make_tuple(addr, - cacheBlocks[block_index].items[wl_offset])); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset])); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. 
responseQueue.size = %d.\n", __func__, addr, @@ -434,6 +435,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + + pendingVertexPullReads.erase(addr); delete pkt; return true; } @@ -466,12 +469,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) delete pkt; } - // FIXME: Get rid of servicedIndices (maybe use an iterator) - std::vector servicedIndices; - for (int i = 0; i < MSHR[block_index].size(); i++) { - Addr miss_addr = MSHR[block_index][i]; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -495,28 +497,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // End of the said block - servicedIndices.push_back(i); - // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - // "removal.\n", __func__, i, block_index); + it = MSHR[block_index].erase(it); + } else { + it++; } } - // TODO: We Can use taken instead of this - // TODO: Change the MSHR from map to map - int bias = 0; - for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHR[block_index][i - bias]; - MSHR[block_index].erase(MSHR[block_index].begin() + - servicedIndices[i] - bias); - bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " - "and is removed.\n", __func__, print_addr); - } - if (MSHR[block_index].empty()) { MSHR.erase(block_index); - // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -902,24 +890,8 @@ CoalesceEngine::getOptimalBitVectorSlice() 
(!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - // current_score += numElementsPerLine * 2; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = true; - // if (score == max_score_possible) { - // break; - // } - // } return std::make_tuple(true, it); } else if (cacheBlocks[block_index].addr != addr) { - // score += numElementsPerLine; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = false; - // assert(score < max_score_possible); - // } return std::make_tuple(false, it); } } @@ -928,7 +900,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -961,6 +933,8 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; + + pendingVertexPullReads.insert(addr); // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. 
It can simply set // a flag to true (maybe not even needed just look if the cache has a @@ -972,9 +946,9 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " "0 to memoryFunctionQueue.\n", __func__); } } @@ -999,7 +973,7 @@ CoalesceEngine::recvVertexPull() numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 03b463e570..75c36f9c03 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,12 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); + std::unordered_set pendingVertexPullReads; + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextPushRetry(int slice_base, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; + void processNextVertexPull(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); From f493fc52f5f2be494b415fd773e9846e13624e92 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:00:19 -0700 Subject: [PATCH 
148/279] Added HBM as vertex memory. It doesn't exit! --- configs/accl/sega.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0bfb5ddce..2c44c1f7eb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,20 +20,26 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size: str): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, - cache_size="8MiB", + cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=16) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s") + + vertex_interface = HBM_1000_4H_1x128() + # vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface + self.vertex_mem_ctrl = ctrl + # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + # latency_var="0ns", + # bandwidth="19.2GiB/s") self.edge_mem_ctrl = SimpleMemory(latency="30ns", latency_var="0ns", bandwidth="19.2GiB/s", @@ -58,7 +64,8 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range + # self.vertex_mem_ctrl.range = vertex_range + self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image @@ -66,6 +73,7 @@ class SEGA(System): def __init__(self, num_mpus, vertex_cache_line_size, + cache_size, graph_path, first_addr, first_value): @@ -85,11 +93,15 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - vertex_ranges = 
interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + # vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"),\ + num_mpus,\ + vertex_cache_line_size) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB") + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -103,19 +115,21 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, \ + print("******* ", args.cache_size) + return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, \ + num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, \ + system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value) root = Root(full_system = False, system = system) From f580571a455e5b1b7da658339c163da73f634525 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:24:19 -0700 Subject: [PATCH 149/279] Adding Real memory for EM --- configs/accl/sega.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2c44c1f7eb..e9286deafc 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ 
-20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size, cache_size: str, i): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -40,11 +40,13 @@ def __init__(self, edge_memory_size, cache_size: str): # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", # latency_var="0ns", # bandwidth="19.2GiB/s") - self.edge_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False) + edge_interface = DDR4_2400_8x8( + device_size = edge_memory_size, + image_file = f"{graph_path}/edgelist_{i}", + in_addr_map=False) + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface + self.edge_mem_ctrl = edge_ctrl self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -67,7 +69,7 @@ def set_vertex_range(self, vertex_range): # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, @@ -101,7 +103,7 @@ def __init__(self, gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("8GiB", cache_size, i) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) From 0031a84d259c53ed7aafbd4196b99e2e57553d0f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:38:00 -0700 Subject: [PATCH 150/279] Fixing style. 
--- configs/accl/sega.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e9286deafc..1e360676cb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str, i): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -31,18 +31,14 @@ def __init__(self, edge_memory_size, cache_size: str, i): self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - - vertex_interface = HBM_1000_4H_1x128() - # vertex_interface.range = self._vertex_ranges[i] + + vertex_interface = HBM_1000_4H_1x128(burst_length=2) ctrl = MemCtrl() ctrl.dram = vertex_interface self.vertex_mem_ctrl = ctrl - # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - # latency_var="0ns", - # bandwidth="19.2GiB/s") + edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - image_file = f"{graph_path}/edgelist_{i}", + device_size = edge_memory_size, in_addr_map=False) edge_ctrl = MemCtrl() edge_ctrl.dram = edge_interface @@ -74,7 +70,6 @@ def set_edge_image(self, edge_image): class SEGA(System): def __init__(self, num_mpus, - vertex_cache_line_size, cache_size, graph_path, first_addr, @@ -83,7 +78,7 @@ def __init__(self, self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = vertex_cache_line_size + self.cache_line_size = 32 self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, @@ -95,15 +90,14 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - # vertex_ranges = interleave_addresses(AddrRange("4GiB"), 
num_mpus, vertex_cache_line_size) vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"),\ - num_mpus,\ - vertex_cache_line_size) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size, i) + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -116,23 +110,20 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) - argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - print("******* ", args.cache_size) - return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ + + return args.num_mpus, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value = get_inputs() + num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value) + system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() From 3fb094d0b26117a1a7f94271beade6a317da93a8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:44:37 -0700 Subject: [PATCH 151/279] Khoshgelation. 
--- configs/accl/sega.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1e360676cb..b023507a39 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -22,27 +22,21 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, + self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=16) + num_tgts_per_mshr=32) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - vertex_interface = HBM_1000_4H_1x128(burst_length=2) - ctrl = MemCtrl() - ctrl.dram = vertex_interface - self.vertex_mem_ctrl = ctrl + self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - in_addr_map=False) - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - self.edge_mem_ctrl = edge_ctrl + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -62,7 +56,6 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image From 079a873e5cf26aa7e0ab5c1f880710ef4a4cac8c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 2 Sep 2022 07:47:19 -0700 Subject: [PATCH 152/279] Adding new stats. 
--- configs/accl/sega.py | 3 +- src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 71 +++++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b023507a39..5cf557719f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -27,7 +27,8 @@ def __init__(self, edge_memory_size, cache_size: str): self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=32) + num_tgts_per_mshr=32, + max_resp_per_cycle=4) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 14902ef352..2cc756ff3f 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ba7878be7a..1715d637f1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,6 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -141,7 +142,7 @@ CoalesceEngine::recvWLRead(Addr addr) // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset])); + addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", @@ -197,6 +198,7 @@ CoalesceEngine::recvWLRead(Addr addr) "cacheBlocks[%d].\n", __func__, block_index); } MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -312,6 +314,7 @@ CoalesceEngine::recvWLRead(Addr addr) } // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; @@ -344,6 +347,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); 
DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( @@ -382,11 +386,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - // cacheBlocks[block_index].hasConflict = true; // TODO: Might want to differentiate between different misses. stats.readMisses++; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -481,7 +485,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset])); + cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, @@ -519,22 +523,36 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) void CoalesceEngine::processNextResponseEvent() { + int num_responses_sent = 0; + Addr addr_response; WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - owner->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + if ((num_responses_sent >= maxRespPerCycle) || + (responseQueue.empty())) { + break; + } + } if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -694,9 +712,9 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; - } - if (!owner->running()) { - owner->start(); + if (!owner->running()) { + owner->start(); + } } } } @@ -997,10 +1015,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections."), - ADD_STAT(falseApplySchedules, statistics::units::Count::get(), - "Number of failed apply schedules."), - ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), - "Number of failed evict schedules.") + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries.") { } @@ -1008,6 +1026,11 @@ void CoalesceEngine::CoalesceStats::regStats() { using 
namespace statistics; + + mshrEntryLength.init(64); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 75c36f9c03..641ed327bb 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,7 +102,8 @@ class CoalesceEngine : public BaseMemoryEngine int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque> responseQueue; + int maxRespPerCycle; + std::deque> responseQueue; int _workCount; int numPullsReceived; @@ -144,8 +145,9 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; - statistics::Scalar falseApplySchedules; - statistics::Scalar falseEvictSchedules; + + statistics::Formula hitRate; + statistics::Histogram mshrEntryLength; }; CoalesceStats stats; From 106e4c3c84138b226d657d03fa126ed8aa065a5d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 4 Sep 2022 20:42:43 -0700 Subject: [PATCH 153/279] Fixing asserion error on busyMask. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/busyMaskErr | 16 ++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 src/accl/graph/sega/busyMaskErr diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 5cf557719f..3fa5b99b3a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr new file mode 100644 index 0000000000..316fcd37d9 --- /dev/null +++ b/src/accl/graph/sega/busyMaskErr @@ -0,0 +1,16 @@ +gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0 + +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. 
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. 
+assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1715d637f1..3ff867c274 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -162,7 +162,12 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: If a read happens on the same cycle as another operation such + // apply setLastChangedTick to half a cycle later so that operations + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); From f56df58285e740fd583ed913b61835a9bcd0da4d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 5 Sep 2022 14:27:49 -0700 Subject: [PATCH 154/279] Fixing finding work in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 90 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 4 +- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 3ff867c274..7a52d29c98 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), startSearchIndex(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -79,6 +79,9 @@ CoalesceEngine::registerMPU(MPU* mpu) bool CoalesceEngine::done() { + bool push_none = needsPush.none(); + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -885,41 +888,46 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalBitVectorSlice() +std::tuple +CoalesceEngine::getOptimalPullAddr() { - bool hit_in_cache = false; - int slice_base = -1; - - // int score = 0; - // int max_score_possible = 3 * numElementsPerLine; - for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - // int current_score = 0; + int it = startSearchIndex; + int initial_search_index = startSearchIndex; + while (true) { uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } - if (current_popcount == 0) { - continue; + if (current_popcount != 0) { + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + // Otherwise if it is in memory + } else if (cacheBlocks[block_index].addr != addr) { + if (pendingVertexPullReads.find(addr) != + pendingVertexPullReads.end()) { + startSearchIndex = + (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + } + } } - // current_score += current_popcount; - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); - // Idle state: valid && !pendingApply && !pendingWB - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - return std::make_tuple(true, it); - } else if (cacheBlocks[block_index].addr != addr) { - return std::make_tuple(false, it); + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + if (it == initial_search_index) { + break; } } - - return std::make_tuple(hit_in_cache, slice_base); + // return garbage + return std::make_tuple(false, -1, 0); } void @@ -927,10 +935,10 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); + Addr addr; + std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); if (slice_base != -1) { - Addr addr = getBlockAddrFromBitIndex(slice_base); int block_index = getBlockIndex(addr); if (hit_in_cache) { 
assert(cacheBlocks[block_index].valid); @@ -958,10 +966,6 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) onTheFlyReqs++; pendingVertexPullReads.insert(addr); - // TODO: Set a tracking structure so that nextMemoryReadEvent knows - // It does not have to read this address anymore. It can simply set - // a flag to true (maybe not even needed just look if the cache has a - // line allocated for it in the cacheBlocks). } numPullsReceived--; } @@ -993,14 +997,18 @@ CoalesceEngine::recvMemRetry() void CoalesceEngine::recvVertexPull() { + bool should_schedule = (numPullsReceived == 0); numPullsReceived++; - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 641ed327bb..92c28ae11e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -107,13 +107,14 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalBitVectorSlice(); + std::tuple getOptimalPullAddr(); std::unordered_set pendingVertexPullReads; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 7b1727587a..63aa474542 100644 --- a/src/accl/graph/sega/mpu.cc +++ 
b/src/accl/graph/sega/mpu.cc @@ -194,7 +194,9 @@ MPU::sendPacket(PacketPtr pkt) void MPU::recvDoneSignal() { - centeralController->recvDoneSignal(); + if (done()) { + centeralController->recvDoneSignal(); + } } bool From 164f423e056163605433d86081751201d1a80a78 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 6 Sep 2022 14:21:37 -0700 Subject: [PATCH 155/279] Fixing choosing work in coalesce engine. --- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 18 +- src/accl/graph/sega/push_engine.cc | 3 - 4 files changed, 194 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 42a8d84ad5..5d48b46fba 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,13 +43,11 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('BaseMemoryEngine') -DebugFlag('BitVector') DebugFlag('CenteralController') DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') -DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a52d29c98..cf0e2872f6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,7 +33,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -80,7 +79,7 @@ bool CoalesceEngine::done() { bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", __func__, push_none ? 
"true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); @@ -428,26 +427,23 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (cacheBlocks[block_index].valid))); // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. + + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); + uint64_t send_mask = pendingVertexPullReads[addr]; WorkListItem* items = pkt->getPtr(); // No applying of the line needed. - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[it + i] == 1) { - _workCount--; + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; owner->recvVertexPush(vertex_addr, items[i]); - break; } } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - pendingVertexPullReads.erase(addr); delete pkt; return true; @@ -720,6 +716,7 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); } @@ -888,19 +885,78 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple +// std::tuple +// CoalesceEngine::getOptimalPullAddr() +// { +// int it = startSearchIndex; +// int initial_search_index = startSearchIndex; +// while (true) { +// uint32_t current_popcount = 0; +// for (int i = 0; i < numElementsPerLine; i++) { +// current_popcount += needsPush[it + i]; +// } +// if 
(current_popcount != 0) { +// Addr addr = getBlockAddrFromBitIndex(it); +// int block_index = getBlockIndex(addr); +// // Only if it is in cache and it is in idle state. +// if ((cacheBlocks[block_index].addr == addr) && +// (cacheBlocks[block_index].valid) && +// (cacheBlocks[block_index].busyMask == 0) && +// (!cacheBlocks[block_index].pendingApply) && +// (!cacheBlocks[block_index].pendingWB)) { +// assert(!cacheBlocks[block_index].needsApply); +// assert(!cacheBlocks[block_index].pendingData); +// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// // Otherwise if it is in memory +// } else if (cacheBlocks[block_index].addr != addr) { +// if (pendingVertexPullReads.find(addr) != +// pendingVertexPullReads.end()) { +// startSearchIndex = +// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// } +// } +// } +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// if (it == initial_search_index) { +// break; +// } +// } +// // return garbage +// return std::make_tuple(false, -1, 0); +// } + +std::tuple CoalesceEngine::getOptimalPullAddr() { - int it = startSearchIndex; - int initial_search_index = startSearchIndex; - while (true) { - uint32_t current_popcount = 0; - for (int i = 0; i < numElementsPerLine; i++) { - current_popcount += needsPush[it + i]; - } - if (current_popcount != 0) { - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != 
pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::PENDING_READ, addr, index_offset); + /* + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask = 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + */ + } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid) && @@ -909,67 +965,122 @@ CoalesceEngine::getOptimalPullAddr() (!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if (cacheBlocks[block_index].addr != addr) { - if (pendingVertexPullReads.find(addr) != - pendingVertexPullReads.end()) { - startSearchIndex = - (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); - } + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_MEMORY, addr, index_offset); } } - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - if (it == initial_search_index) { - break; - } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; } - // return garbage - return std::make_tuple(false, -1, 0); + + return std::make_tuple(BitStatus::GARBAGE, 0, 0); } +// void +// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +// { +// bool hit_in_cache; +// int slice_base; +// Addr addr; + +// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); +// if (slice_base != 
-1) { +// int block_index = getBlockIndex(addr); +// if (hit_in_cache) { +// assert(cacheBlocks[block_index].valid); +// assert(cacheBlocks[block_index].busyMask == 0); + +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// Addr vertex_addr = addr + i * sizeof(WorkListItem); +// if (needsPush[slice_base + i] == 1) { +// _workCount--; +// needsPush[slice_base + i] = 0; +// owner->recvVertexPush(vertex_addr, +// cacheBlocks[block_index].items[i]); +// break; +// } +// } +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// } else { +// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// onTheFlyReqs++; +// pendingVertexPullReads.insert(addr); +// } +// numPullsReceived--; +// } + +// if (numPullsReceived > 0) { +// memoryFunctionQueue.emplace_back( +// [this] (int slice_base, Tick schedule_tick) { +// processNextVertexPull(slice_base, schedule_tick); +// }, 0, curTick()); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " +// "0 to memoryFunctionQueue.\n", __func__); +// } +// } + void -CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - bool hit_in_cache; - int slice_base; - Addr addr; - - std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); - if (slice_base != -1) { - int block_index = getBlockIndex(addr); - if (hit_in_cache) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[slice_base + i] == 1) { - _workCount--; - 
needsPush[slice_base + i] = 0; - owner->recvVertexPush(vertex_addr, - cacheBlocks[block_index].items[i]); - break; - } - } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } else { + BitStatus bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != BitStatus::GARBAGE) { + if (bit_status == BitStatus::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + } + if (bit_status == BitStatus::IN_CACHE) { + // renaming the outputs to their local names. + int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + owner->recvVertexPush( + vertex_addr, cacheBlocks[block_index].items[wl_offset]); + } + if (bit_status == BitStatus::IN_MEMORY) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; - - pendingVertexPullReads.insert(addr); + pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; } - if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 92c28ae11e..fe7c83afb2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -42,6 +42,14 @@ namespace gem5 { +enum BitStatus +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -107,22 +115,26 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + // CLEAN: Replace with slice_base_queue int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; + std::deque activeBits; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); - std::unordered_set pendingVertexPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int slice_base, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0134133cfa..505d41b0b8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" -#include "debug/TempFlag.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -129,8 +128,6 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, (uint32_t) wl.prop); numPendingPulls--; - DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", - __func__, addr, wl.to_string()); if (workLeft() && 
vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } From 20a902efc24f7c993b7a396c64731612b7599a5d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 18:36:52 -0700 Subject: [PATCH 156/279] Adding support for synthetic traffic --- configs/accl/sega.py | 125 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 3fa5b99b3a..8e901b6e6d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,8 +1,35 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import m5 +import os import argparse +import subprocess from math import log -import math from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): @@ -103,21 +130,101 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("num_mpus", type=int) + argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph_path", type=str) + argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("synthetic", type=bool) + argparser.add_argument("--scale", type=int) + argparser.add_argument("--deg", type=int) + argparser.add_argument("--graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + args = argparser.parse_args() - return args.num_mpus, args.cache_size, \ - args.graph_path, args.init_addr, args.init_value + if args.synthetic: + if (args.scale is None) or (args.deg is None): + raise ValueError("If synthetic is true, you should specify the" + "scale of the graph by --scale [scale] and the average" + "degree of the graph by --deg [average degree].") + else: + if args.graph is None: + raise ValueError("If synthetic is false, you should specify the " + "path to graph binaries by --graph [path to graph].") + return args if __name__ == "__m5_main__": - num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() - 
- print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) + input_args = get_inputs() + + image_path = None + if input_args.synthetic: + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{input_args.scale}", + f"{input_args.deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{input_args.scale} and deg {input_args.deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in {graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") + print(f"Created 
{graph_path}/binaries/gpts_{input_args.num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{input_args.num_gpts}", + f"{input_args.vertex_cache_line_size}", + f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/n{input_args.num_gpts}") + image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" + else: + image_path = input_args.graph + + system = SEGA(input_args.num_gpts, + input_args.cache_size, + image_path, + input_args.init_addr, + input_args.init_value) root = Root(full_system = False, system = system) m5.instantiate() From 3a926e4edb31f15c4d271f75ae1261c26c73e341 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 23:42:01 -0700 Subject: [PATCH 157/279] Adding workload as a parameter --- configs/accl/sega.py | 2 +- src/accl/graph/sega/PushEngine.py | 2 ++ src/accl/graph/sega/WLEngine.py | 2 ++ src/accl/graph/sega/push_engine.cc | 17 ++++++++++++++++- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 19 +++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++++ 7 files changed, 45 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e901b6e6d..ddeae34e4e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -215,7 +215,7 @@ def get_inputs(): f"{input_args.vertex_cache_line_size}", f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) print(f"Created the 
graph binaries in " - f"{graph_path}/binaries/n{input_args.num_gpts}") + f"{graph_path}/binaries/gpts_{input_args.num_gpts}") image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" else: image_path = input_args.graph diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index f98f22ba9d..ad9ddfefcf 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -41,3 +41,5 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 52ca031260..a44352ab9b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -40,3 +40,5 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") # 4 is arbitrary + + workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 505d41b0b8..9f13c00397 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,6 +42,7 @@ PushEngine::PushEngine(const Params& params): _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -85,6 +86,20 @@ PushEngine::done() edgePointerQueue.empty(); } + +uint32_t +PushEngine::propagate(uint32_t value, uint32_t weight) +{ + uint32_t update; + if (workload == "BFS") { + update = value + 1; + } + else{ + panic("The workload %s is 
not supported", workload); + } + return update; +} + void PushEngine::start() { @@ -239,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = curr_edge.value + 1; + uint32_t update_value = propagate(value, 1); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6f92b62be0..a64a5b1f5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -82,7 +82,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } }; - struct PushInfo { Addr src; uint32_t value; @@ -103,6 +102,8 @@ class PushEngine : public BaseMemoryEngine int edgeQueueSize; std::deque> edgeQueue; + std::string workload; + uint32_t propagate(uint32_t value, uint32_t weight); template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9890eeed76..855e36b413 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,6 +41,7 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), + workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -58,6 +59,18 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } +uint32_t +WLEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { @@ -138,7 +151,8 @@ WLEngine::processNextReadEvent() "addr: %lu in 
registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - std::min(update_value, registerFile[update_addr]); + reduce(update_value, registerFile[update_addr]); + // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -191,7 +205,8 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - std::min(update_value, workListFile[addr].tempProp); + reduce(update_value, workListFile[addr].tempProp); + // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4a0489b123..b03a3cdb87 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,6 +47,8 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; + + int updateQueueSize; std::deque> updateQueue; @@ -55,6 +57,9 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From ba6cd3df40cd26bf84a0891801ee923ba5281896 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 7 Sep 2022 13:22:40 -0700 Subject: [PATCH 158/279] Adding workload as a parameter to coalesce engine. 
--- src/accl/graph/sega/CoalesceEngine.py | 5 ++ src/accl/graph/sega/coalesce_engine.cc | 120 ++++--------------------- src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 2 - 5 files changed, 28 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 2cc756ff3f..f6e997f1e3 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,12 @@ class CoalesceEngine(BaseMemoryEngine): cxx_class = 'gem5::CoalesceEngine' cache_size = Param.MemorySize("Size of the internal SRAM array.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index cf0e2872f6..a80d629737 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), startSearchIndex(0), + _workCount(0), numPullsReceived(0), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -85,6 +85,18 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } +uint32_t +CoalesceEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + // addr should be aligned to peerMemoryAtomSize int 
CoalesceEngine::getBlockIndex(Addr addr) @@ -700,8 +712,12 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = std::min(current_prop, - cacheBlocks[block_index].items[index].tempProp); + // NOTE: It might be the case that for workloads other than BFS, + // the reduce function here should be different to the reduce + // function defined in WLEngine. Think about the case of PR in + // detail. + uint32_t new_prop = reduce( + cacheBlocks[block_index].items[index].tempProp, current_prop); if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; @@ -885,48 +901,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -// std::tuple -// CoalesceEngine::getOptimalPullAddr() -// { -// int it = startSearchIndex; -// int initial_search_index = startSearchIndex; -// while (true) { -// uint32_t current_popcount = 0; -// for (int i = 0; i < numElementsPerLine; i++) { -// current_popcount += needsPush[it + i]; -// } -// if (current_popcount != 0) { -// Addr addr = getBlockAddrFromBitIndex(it); -// int block_index = getBlockIndex(addr); -// // Only if it is in cache and it is in idle state. 
-// if ((cacheBlocks[block_index].addr == addr) && -// (cacheBlocks[block_index].valid) && -// (cacheBlocks[block_index].busyMask == 0) && -// (!cacheBlocks[block_index].pendingApply) && -// (!cacheBlocks[block_index].pendingWB)) { -// assert(!cacheBlocks[block_index].needsApply); -// assert(!cacheBlocks[block_index].pendingData); -// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// // Otherwise if it is in memory -// } else if (cacheBlocks[block_index].addr != addr) { -// if (pendingVertexPullReads.find(addr) != -// pendingVertexPullReads.end()) { -// startSearchIndex = -// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// } -// } -// } -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// if (it == initial_search_index) { -// break; -// } -// } -// // return garbage -// return std::make_tuple(false, -1, 0); -// } - std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -949,13 +923,6 @@ CoalesceEngine::getOptimalPullAddr() activeBits.pop_front(); return std::make_tuple( BitStatus::PENDING_READ, addr, index_offset); - /* - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask = 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - */ } else { // Only if it is in cache and it is in idle state. 
if ((cacheBlocks[block_index].addr == addr) && @@ -983,55 +950,6 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple(BitStatus::GARBAGE, 0, 0); } -// void -// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) -// { -// bool hit_in_cache; -// int slice_base; -// Addr addr; - -// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); -// if (slice_base != -1) { -// int block_index = getBlockIndex(addr); -// if (hit_in_cache) { -// assert(cacheBlocks[block_index].valid); -// assert(cacheBlocks[block_index].busyMask == 0); - -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// Addr vertex_addr = addr + i * sizeof(WorkListItem); -// if (needsPush[slice_base + i] == 1) { -// _workCount--; -// needsPush[slice_base + i] = 0; -// owner->recvVertexPush(vertex_addr, -// cacheBlocks[block_index].items[i]); -// break; -// } -// } -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// } else { -// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// onTheFlyReqs++; -// pendingVertexPullReads.insert(addr); -// } -// numPullsReceived--; -// } - -// if (numPullsReceived > 0) { -// memoryFunctionQueue.emplace_back( -// [this] (int slice_base, Tick schedule_tick) { -// processNextVertexPull(slice_base, schedule_tick); -// }, 0, curTick()); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " -// "0 to memoryFunctionQueue.\n", __func__); -// } -// } - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index fe7c83afb2..7503d69b76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -115,8 +115,6 
@@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; - // CLEAN: Replace with slice_base_queue - int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; @@ -130,6 +128,9 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9f13c00397..625f836561 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -254,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = propagate(value, 1); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 855e36b413..5465769cff 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,7 +152,6 @@ WLEngine::processNextReadEvent() __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = reduce(update_value, registerFile[update_addr]); - // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -206,7 +205,6 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = reduce(update_value, workListFile[addr].tempProp); - // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; From a2929701d684de4a0e0ea767e8b1b756d0bcd0a4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 8 Sep 2022 10:20:48 -0700 Subject: [PATCH 159/279] Adding stats. --- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 +++- src/accl/graph/sega/push_engine.cc | 7 ++++++- src/accl/graph/sega/push_engine.hh | 2 ++ src/accl/graph/sega/wl_engine.cc | 9 ++++++++- src/accl/graph/sega/wl_engine.hh | 1 + 7 files changed, 35 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ddeae34e4e..e8d76e7dad 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -159,7 +159,7 @@ def get_inputs(): image_path = None if input_args.synthetic: base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_gen = os.environ.get("GRAPH_GEN") graph_reader = os.environ.get("GRAPH_READER") graph_sorter = os.environ.get("GRAPH_SORTER") if graph_gen is None: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a80d629737..dbe5e56f2d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -210,7 +210,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " @@ -241,7 +241,7 @@ CoalesceEngine::recvWLRead(Addr addr) "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection - stats.readRejections++; + stats.mshrEntryShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR " @@ -399,7 +399,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } DPRINTF(CoalesceEngine, "%s: There is room for another target " @@ -740,6 +740,8 @@ CoalesceEngine::processNextApplyEvent() } } } + stats.bitvectorLength.sample(needsPush.count()); + cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; @@ -1055,12 +1057,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries.") + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector") { } @@ -1069,7 +1075,8 @@ 
CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(64); + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7503d69b76..16c417fc60 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -158,10 +158,12 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar readRejections; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 625f836561..855d666989 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -300,7 +300,10 @@ PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates.") + "Number of sent updates."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second.") { } @@ -308,6 +311,8 @@ void PushEngine::PushStats::regStats() { using namespace statistics; + + TEPS = numUpdates / simSeconds; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a64a5b1f5b..a5677067b8 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,8 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + + statistics::Formula TEPS; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 5465769cff..a39905037e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -144,6 +144,10 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; } } else { // TODO: Generalize this to reduce function rather than just min @@ -231,7 +235,10 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies") + "Number of memory blocks read for vertecies"), + ADD_STAT(registerShortage, statistics::units::Count::get(), + "Number of times updates were " + "stalled because of register shortage") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b03a3cdb87..2956e58666 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -76,6 +76,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; + statistics::Scalar registerShortage; }; WorkListStats stats; From 2c7a9dd77dd3f63c5a8c804e2e49bdcfae1e5f31 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 11 Sep 2022 14:39:42 -0700 Subject: [PATCH 160/279] Separating graph generation from run script. 
--- configs/accl/graph-gen.py | 103 ++++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 96 +++-------------------------------- 2 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 configs/accl/graph-gen.py diff --git a/configs/accl/graph-gen.py b/configs/accl/graph-gen.py new file mode 100644 index 0000000000..16985b3537 --- /dev/null +++ b/configs/accl/graph-gen.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") + argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.scale, args.deg, args.num_gpts + +if __name__ == "__main__": + scale, deg, num_gpts = get_inputs() + + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.environ.get("GRAPH_GEN") + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + for delete in os.scandir(graph_path): + os.remove(delete.path) + print(f"Deleted everything in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{scale} and deg {deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in 
{graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") + print(f"Created {graph_path}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e8d76e7dad..10f7ea2b48 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -25,9 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import m5 -import os import argparse -import subprocess from math import log from m5.objects import * @@ -49,7 +47,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=32, + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, @@ -132,99 +130,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("vertex_cache_line_size", type=int) - argparser.add_argument("synthetic", type=bool) - argparser.add_argument("--scale", type=int) - argparser.add_argument("--deg", type=int) - argparser.add_argument("--graph", type=str) + argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - if args.synthetic: - if (args.scale is None) or (args.deg is None): - raise ValueError("If synthetic is true, you should specify the" - "scale of the graph by --scale [scale] and the average" - "degree of the graph by --deg [average degree].") - else: - if args.graph is None: - raise ValueError("If synthetic is false, you should specify the " - "path to graph binaries by --graph [path to graph].") - return args + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value if __name__ == "__m5_main__": - input_args = get_inputs() - - image_path = None - if input_args.synthetic: - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - 
raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{input_args.scale}", - f"{input_args.deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{input_args.scale} and deg {input_args.deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{input_args.num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") - for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") - 
subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{input_args.num_gpts}", - f"{input_args.vertex_cache_line_size}", - f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" - else: - image_path = input_args.graph - - system = SEGA(input_args.num_gpts, - input_args.cache_size, - image_path, - input_args.init_addr, - input_args.init_value) + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) root = Root(full_system = False, system = system) m5.instantiate() From 1e06db532a0da89d86659ec97d247dce54fc2ce6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 12 Sep 2022 15:25:11 -0700 Subject: [PATCH 161/279] Adding new stats. --- src/accl/graph/sega/coalesce_engine.cc | 13 ++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dbe5e56f2d..7646ba8862 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -834,9 +834,13 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } void @@ -1000,6 +1004,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; + } else { + stats.workSearchFails++; } if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( @@ -1061,6 +1067,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(workSearchFails, statistics::units::Count::get(), + "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. " + "Once for push and once to populate the cache."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 16c417fc60..355eaad07d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -160,6 +160,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; From a82ff6d51fed32fee6811ada452102c787bfc4b9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 13 Sep 2022 21:44:54 -0700 Subject: [PATCH 162/279] Fixing sconscript style. 
--- src/accl/graph/base/SConscript | 6 ++--- src/accl/graph/sega/SConscript | 44 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 0e43d1aed8..8b741abfc8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -25,8 +25,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -Import('*') +Import("*") -SimObject('BaseReduceEngine.py') +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) -Source('base_reduce_engine.cc') +Source("base_reduce_engine.cc") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d48b46fba..f16d025ca2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -25,30 +25,30 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-Import('*') +Import("*") -SimObject('BaseMemoryEngine.py') -SimObject('CenteralController.py') -SimObject('CoalesceEngine.py') -SimObject("MPU.py") -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) -Source('base_memory_engine.cc') -Source('centeral_controller.cc') -Source('coalesce_engine.cc') +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") Source("mpu.cc") -Source('push_engine.cc') -Source('wl_engine.cc') +Source("push_engine.cc") +Source("wl_engine.cc") -DebugFlag('ApplyUpdates') -DebugFlag('BaseMemoryEngine') -DebugFlag('CenteralController') -DebugFlag('CacheBlockState') -DebugFlag('CoalesceEngine') -DebugFlag('PushEngine') -DebugFlag('SEGAStructureSize') -DebugFlag('WLEngine') +DebugFlag("ApplyUpdates") +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("WLEngine") -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file From 8ef70136ab587be0c96b1ae8244b366b52e4a8d2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 15 Sep 2022 11:16:25 -0700 Subject: [PATCH 163/279] Adding stats for measuring push and pull rate. 
--- configs/accl/sega.py | 21 ++++++++----- src/accl/graph/sega/coalesce_engine.cc | 34 ++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 41 ++++++++++++++++---------- 3 files changed, 72 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 10f7ea2b48..2a92ee1769 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -92,7 +92,8 @@ def __init__(self, cache_size, graph_path, first_addr, - first_value): + first_value + ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -103,16 +104,20 @@ def __init__(self, self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, - width=64) + width=64 + ) self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices") + image_file=f"{graph_path}/vertices" + ) + self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7646ba8862..5f1e849660 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -454,6 +454,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) needsPush[it + i] = 0; _workCount--; owner->recvVertexPush(vertex_addr, items[i]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - 
stats.lastResetTick; } } pendingVertexPullReads.erase(addr); @@ -990,6 +992,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) _workCount--; owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } if (bit_status == BitStatus::IN_MEMORY) { Addr addr = location; @@ -1037,6 +1041,8 @@ CoalesceEngine::recvVertexPull() bool should_schedule = (numPullsReceived == 0); numPullsReceived++; + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; if (should_schedule) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1052,7 +1058,7 @@ CoalesceEngine::recvVertexPull() CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - + lastResetTick(0), ADD_STAT(numVertexReads, statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), @@ -1072,8 +1078,22 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. " "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. 
(Relative to reset_stats)"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), @@ -1091,6 +1111,18 @@ CoalesceEngine::CoalesceStats::regStats() hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 355eaad07d..8190478a1b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -147,25 +147,36 @@ class CoalesceEngine : public BaseMemoryEngine struct CoalesceStats : public statistics::Group { - CoalesceStats(CoalesceEngine &coalesce); + CoalesceStats(CoalesceEngine &coalesce); - void regStats() override; + virtual void regStats() override; - CoalesceEngine &coalesce; + virtual void resetStats() override; - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - statistics::Scalar workSearchFails; - statistics::Scalar numDoubleMemReads; + CoalesceEngine &coalesce; - statistics::Formula hitRate; - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; + Tick lastResetTick; + + statistics::Scalar 
numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; From 2a6bea7e990e1de1deed20e42c6efab290643430 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 16 Sep 2022 14:18:57 -0700 Subject: [PATCH 164/279] Added FinalAnswer debugFlag and answer printing. --- configs/accl/sega.py | 8 ++-- src/accl/graph/sega/CenteralController.py | 4 +- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/base_memory_engine.hh | 2 +- src/accl/graph/sega/centeral_controller.cc | 43 ++++++++++++++++++---- src/accl/graph/sega/centeral_controller.hh | 7 ++-- src/accl/graph/sega/coalesce_engine.cc | 36 ++++++++++++++---- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 9 files changed, 82 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2a92ee1769..7b37742cdb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -107,9 +107,11 @@ def __init__(self, width=64 ) - self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices" - ) + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6f6b12ea2c..9bee76511d 
100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -39,6 +39,6 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - addr = Param.Addr("The addr for the initial update") - value = Param.Int("The value for the initial update") + init_addr = Param.Addr("The addr for the initial update") + init_value = Param.Int("The value for the initial update") image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f16d025ca2..5d411be9ac 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -41,11 +41,11 @@ Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") -DebugFlag("ApplyUpdates") DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") +DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") DebugFlag("WLEngine") diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index f336edcbf1..afe7fd0433 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -108,7 +108,7 @@ class BaseMemoryEngine : public ClockedObject AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } - void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void recvFunctional(PacketPtr pkt) = 0; virtual void init() override; }; diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 5ce7228abb..c6de1d8390 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,8 +28,6 @@ #include "accl/graph/sega/centeral_controller.hh" -#include - #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -44,8 +42,7 @@ 
CenteralController::CenteralController ClockedObject(params), system(params.system), reqPort(name() + ".req_port", this), - addr(params.addr), - value(params.value) + maxVertexAddr(0) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -66,9 +63,9 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::initState() { - ClockedObject::initState(); + // ClockedObject::initState(); - const auto &file = params().image_file; + const auto& file = params().image_file; if (file == "") return; @@ -77,6 +74,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); + maxVertexAddr = image.maxAddr(); PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, system->cacheLineSize()); @@ -86,7 +84,10 @@ CenteralController::initState() void CenteralController::startup() { - PacketPtr first_update = createUpdatePacket(addr, value); + Addr initial_addr = params().init_addr; + uint32_t initial_value = params().init_value; + PacketPtr first_update = + createUpdatePacket(initial_addr, initial_value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -111,6 +112,21 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC((Addr) 0); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { @@ -160,6 +176,19 @@ CenteralController::recvDoneSignal() } if (done) { + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + reqPort.sendFunctional(pkt); + + int 
num_items = system->cacheLineSize() / sizeof(WorkListItem); + WorkListItem items[num_items]; + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + + for (int i = 0; i < num_items; i++) { + DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, addr, i, items[i].to_string()); + } + } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index c54c4c04ef..bd272cf30d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,12 +68,12 @@ class CenteralController : public ClockedObject System* system; ReqPort reqPort; - Addr addr; - uint32_t value; - + Addr maxVertexAddr; std::vector mpuVector; + template PacketPtr createUpdatePacket(Addr addr, T value); + PacketPtr createReadPacket(Addr addr, unsigned int size); void functionalAccess(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 5f1e849660..59d9720148 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" -#include "debug/ApplyUpdates.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -75,12 +74,38 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + 
assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. + // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + memPort.sendFunctional(pkt); + } +} + bool CoalesceEngine::done() { - bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", - __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -723,9 +748,6 @@ CoalesceEngine::processNextApplyEvent() if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, cacheBlocks[block_index].addr, index, - cacheBlocks[block_index].items[index].to_string()); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8190478a1b..bb6fd9d1ea 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -190,6 +190,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a5677067b8..b317992b2d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -142,6 +142,8 @@ class 
PushEngine : public BaseMemoryEngine PushEngine(const Params& params); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); From 59400e0b4698744dcd6d832f9c027e6353ea1316 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 18 Sep 2022 17:17:24 -0700 Subject: [PATCH 165/279] Adding stats to measure vertexReadLatency. --- src/accl/graph/sega/coalesce_engine.cc | 5 ++++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 14 ++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 59d9720148..d4102a8bca 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -810,6 +810,7 @@ void CoalesceEngine::processNextMemoryEvent() { if (memPort.blocked()) { + stats.numMemoryBlocks++; nextMemoryEvent.sleep(); return; } @@ -1097,6 +1098,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by target shortage."), ADD_STAT(workSearchFails, statistics::units::Count::get(), "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. 
" "Once for push and once to populate the cache."), @@ -1147,4 +1150,4 @@ CoalesceEngine::CoalesceStats::resetStats() lastResetTick = curTick(); } -} +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index bb6fd9d1ea..967d83a531 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; statistics::Scalar workSearchFails; + statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a39905037e..b16d827dbe 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -143,6 +143,7 @@ WLEngine::processNextReadEvent() "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); + vertexReadTime[update_addr] = curTick(); } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -189,6 +190,11 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. 
workListFile.size = %d.\n", __func__, addr, wl.to_string(), workListFile.size()); + + stats.vertexReadLatency.sample( + (curTick() - vertexReadTime[addr]) / getClockFrequency()); + vertexReadTime.erase(addr); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); @@ -238,7 +244,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of memory blocks read for vertecies"), ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " - "stalled because of register shortage") + "stalled because of register shortage"), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a vertex.") { } @@ -246,6 +254,8 @@ void WLEngine::WorkListStats::regStats() { using namespace statistics; -} + vertexReadLatency.init(64); } + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2956e58666..0c6361825e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,13 +47,12 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; - - int updateQueueSize; std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; + std::unordered_map vertexReadTime; std::unordered_map workListFile; @@ -77,6 +76,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + + statistics::Histogram vertexReadLatency; }; WorkListStats stats; From 94752ea80e3021dbd6ef3ab3a93d88e58ef5a7d2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 19 Sep 2022 11:56:05 -0700 Subject: [PATCH 166/279] Adding a config script with simple memory --- configs/accl/sega-simple.py | 177 ++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 48 ++++++---- 2 files changed, 206 insertions(+), 19 deletions(-) create mode 100644 configs/accl/sega-simple.py diff --git 
a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py new file mode 100644 index 0000000000..ae537e76ca --- /dev/null +++ b/configs/accl/sega-simple.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__( + self, + num_mpus, + cache_size, + graph_path, + first_addr, + first_value + ): + super(SEGA, self).__init__() 
+ self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '1GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.interconnect = NoncoherentXBar( + frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64 + ) + + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) + + self.ctrl.req_port = self.interconnect.cpu_side_ports + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("8GiB", cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + root = Root(full_system = False, system = system) + + m5.instantiate() + + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7b37742cdb..8c30d10dec 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,29 +47,39 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): 
def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, - register_file_size=32) - self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=32, - max_resp_per_cycle=4) - self.push_engine = PushEngine(push_req_queue_size=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64) + resp_queue_size=64 + ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port - self.mpu = MPU(wl_engine=self.wl_engine, + self.mpu = MPU( + wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine) + push_engine=self.push_engine + ) def getRespPort(self): return self.mpu.in_port @@ -87,7 +97,8 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__(self, + def __init__( + self, num_mpus, cache_size, graph_path, @@ -101,25 +112,24 @@ def __init__(self, self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar(frontend_latency=1, + self.interconnect = NoncoherentXBar( + frontend_latency=1, forward_latency=1, response_latency=1, width=64 ) self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, + addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices" ) - self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - 
AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): From dda4f4fb95cbefec0cc0cf46f1246a7405e1728e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 19 Sep 2022 20:27:40 -0700 Subject: [PATCH 167/279] Adding stats to count the result of bitvector search. --- src/accl/graph/sega/coalesce_engine.cc | 12 +++++++----- src/accl/graph/sega/coalesce_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 3 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d4102a8bca..b870345d57 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1031,9 +1031,10 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; - } else { - stats.workSearchFails++; } + + stats.bitvectorSearchStatus[bit_status]++; + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1096,8 +1097,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), - ADD_STAT(workSearchFails, statistics::units::Count::get(), - "Number of times coalesce engine fails to find work to push."), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1111,6 +1110,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. 
(Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rateblocked()) { + stats.numNetBlocks++; nextPushEvent.sleep(); return; } @@ -301,6 +302,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index b317992b2d..801d8e567d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,7 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + statistics::Scalar numNetBlocks; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b16d827dbe..c6e8fda523 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -192,7 +192,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) wl.to_string(), workListFile.size()); stats.vertexReadLatency.sample( - (curTick() - vertexReadTime[addr]) / getClockFrequency()); + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); vertexReadTime.erase(addr); assert(!workListFile.empty()); From 58ba502f0037dc74b2497723d461c5388ef99b21 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 12:21:46 -0700 Subject: [PATCH 168/279] Adding a stat to count number of idle cycles. 
--- src/accl/graph/sega/push_engine.cc | 6 +++++- src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.hh | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a56283cbf6..5029013acd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,7 +28,6 @@ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -40,6 +39,7 @@ namespace gem5 PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), _running(false), + lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), workload(params.workload), @@ -107,6 +107,7 @@ PushEngine::start() assert(!nextVertexPullEvent.scheduled()); _running = true; + stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); if (vertexSpace()) { @@ -123,6 +124,7 @@ PushEngine::processNextVertexPullEvent() if (!workLeft()) { _running = false; + lastIdleEntranceTick = curTick(); } if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -304,6 +306,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of sent updates."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), + ADD_STAT(numIdleCycles, statistics::units::Count::get(), + "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 801d8e567d..1f139d061e 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -37,7 +37,6 @@ namespace gem5 { -class CoalesceEngine; class MPU; class PushEngine : public BaseMemoryEngine @@ -88,10 +87,10 @@ class PushEngine : public BaseMemoryEngine Addr offset; int numElements; }; + MPU* owner; bool _running; - int numElementsPerLine; - MPU* owner; + Tick lastIdleEntranceTick; int numPendingPulls; int edgePointerQueueSize; @@ -128,6 +127,7 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numUpdates; statistics::Scalar numNetBlocks; + statistics::Scalar numIdleCycles; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0c6361825e..3d527df3cf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -77,6 +77,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Histogram vertexReadLatency; }; From 3bb5376dc9e99314f145a920a0071b434fabc1bd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 17:32:46 -0700 Subject: [PATCH 169/279] Adding stats to measure queueing latencies. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/data_structs.hh | 6 ++++-- src/accl/graph/sega/coalesce_engine.cc | 17 +++++++++++++++-- src/accl/graph/sega/coalesce_engine.hh | 5 ++++- src/accl/graph/sega/push_engine.cc | 25 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 12 +++++++++--- src/accl/graph/sega/wl_engine.cc | 12 +++++++++--- src/accl/graph/sega/wl_engine.hh | 4 ++-- 8 files changed, 64 insertions(+), 20 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8c30d10dec..a67551a5fd 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -120,7 +120,8 @@ def __init__( ) self.ctrl = CenteralController( - addr=first_addr, value=first_value, + init_addr=first_addr, + init_value=first_value, image_file=f"{graph_path}/vertices" ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 026a3cb7b2..a46aaf2de9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,8 +96,10 @@ struct CompleteEdge { uint32_t weight; uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): - src(src), dst(dst), weight(weight), value(value) + uint64_t entrance; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b870345d57..62cae01613 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -826,6 +826,8 @@ CoalesceEngine::processNextMemoryEvent() next_memory_function_tick) = memoryFunctionQueue.front(); next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) 
+ * 1e9 / getClockFrequency()); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " "memoryFunctionQueue.size = %d.\n", __func__, memoryFunctionQueue.size()); @@ -929,6 +931,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); + stats.numInvalidMemFunctions++; } } @@ -1110,6 +1113,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), @@ -1123,7 +1128,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector.") + "Histogram of the length of the bitvector."), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") { } @@ -1134,7 +1141,11 @@ CoalesceEngine::CoalesceStats::regStats() mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); bitvectorLength.init(64); - bitvectorSearchStatus.init(4); + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); hitRate = (readHits + readHitUnderMisses) / (readHits + 
readHitUnderMisses + readMisses); @@ -1142,6 +1153,8 @@ CoalesceEngine::CoalesceStats::regStats() vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + memoryFunctionLatency.init(64); } void diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2b7b17d196..262f75fbcf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,8 @@ enum BitStatus PENDING_READ, IN_CACHE, IN_MEMORY, - GARBAGE + GARBAGE, + NUM_STATUS }; class MPU; @@ -170,6 +171,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidMemFunctions; statistics::Vector bitvectorSearchStatus; @@ -179,6 +181,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram memoryFunctionLatency; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5029013acd..af1c904eda 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -142,8 +142,10 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back( + start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, + (uint32_t) wl.prop, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -182,6 +184,9 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current 
EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - curr_info.entrance()) * + 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); @@ -224,8 +229,8 @@ PushEngine::handleMemResp(PacketPtr pkt) Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, - edge_weight, push_info.value); + edges.emplace_back( + push_info.src, edge_dst, edge_weight, push_info.value, curTick()); } edgeQueue.push_back(edges); onTheFlyMemReqs--; @@ -267,7 +272,8 @@ PushEngine::processNextPushEvent() "with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); - + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); edge_list.pop_front(); if (edge_list.empty()) { edgeQueue.pop_front(); @@ -310,7 +316,11 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), - "Traversed Edges Per Second.") + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgeQueue.") { } @@ -320,6 +330,9 @@ PushEngine::PushStats::regStats() using namespace statistics; TEPS = numUpdates / simSeconds; + + edgePointerQueueLatency.init(64); + edgeQueueLatency.init(64); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1f139d061e..5d2277eb5a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -52,11 +52,12 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; + Tick 
_entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + size_t atom, Addr src, uint32_t value, Tick entrance): + _start(start), _end(end), _step(step), _atom(atom), + _src(src), _value(value), _entrance(entrance) {} std::tuple nextReadPacketInfo() @@ -80,6 +81,8 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } + + Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -130,6 +133,9 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numIdleCycles; statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgeQueueLatency; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index c6e8fda523..5d4dd1723e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -79,7 +79,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", __func__, pkt->getAddr(), pkt->getLE(), @@ -105,7 +105,8 @@ WLEngine::processNextReadEvent() { Addr update_addr; uint32_t update_value; - std::tie(update_addr, update_value) = updateQueue.front(); + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -134,6 +135,7 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -162,6 +164,7 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -246,7 +249,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of times updates were " "stalled because of register shortage"), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), - "Histogram of the latency of reading a vertex.") + "Histogram of the latency of reading a vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") { } @@ -256,6 +261,7 @@ WLEngine::WorkListStats::regStats() using namespace statistics; vertexReadLatency.init(64); + updateQueueLatency.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3d527df3cf..f888979be9 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,7 +48,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; int updateQueueSize; - std::deque> updateQueue; + std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; @@ -77,8 +77,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar 
registerFileCoalesce; statistics::Scalar registerShortage; - statistics::Histogram vertexReadLatency; + statistics::Histogram updateQueueLatency; }; WorkListStats stats; From 63a69c5d79314c335110dab823540a109a039a3c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 10:52:35 -0700 Subject: [PATCH 170/279] Added pybindmethod to createInitialUpdate. merge added. --- configs/accl/sega-simple.py | 8 +- configs/accl/sega-single-simple.py | 151 ++++++++++++++++++++ configs/accl/sega-single.py | 155 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 8 +- src/accl/graph/sega/MPU.py | 1 + src/accl/graph/sega/base_memory_engine.cc | 20 +-- src/accl/graph/sega/centeral_controller.cc | 131 +++++------------ src/accl/graph/sega/centeral_controller.hh | 39 ++---- src/accl/graph/sega/coalesce_engine.cc | 27 ++++ src/base/addr_range.hh | 30 ++++ 10 files changed, 429 insertions(+), 141 deletions(-) create mode 100644 configs/accl/sega-single-simple.py create mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index ae537e76ca..e0a4fcc89e 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -65,15 +65,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) self.vertex_mem_ctrl = SimpleMemory( - latency="75ns", + latency="0ns", latency_var="0ns", - bandwidth="19.2GB/s" + bandwidth="0GB/s" ) self.edge_mem_ctrl = SimpleMemory( - latency="75ns", + latency="30ns", latency_var="0ns", - bandwidth="19.2GB/s", + bandwidth="32GB/s", range=AddrRange(edge_memory_size), in_addr_map=False ) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py new file mode 100644 index 0000000000..a87e6c53bb --- /dev/null +++ b/configs/accl/sega-single-simple.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="0GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_vertex_image(self, vertex_image): + self.vertex_mem_ctrl.image_file = vertex_image + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, 
graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_range(AddrRange("4GiB")) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py new file mode 100644 index 0000000000..d9fe11a781 --- /dev/null +++ b/configs/accl/sega-single.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GiB/s" + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + 
self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + plain_vertex_range = AddrRange("4GiB") + self._vertex_ranges = interleave_addresses( + plain_vertex_range, + 1, + 32 + ) + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_range(self._vertex_ranges[0]) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9bee76511d..0721ff977c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.ClockedObject import ClockedObject class CenteralController(ClockedObject): @@ -35,10 +36,9 @@ class CenteralController(ClockedObject): cxx_class = 'gem5::CenteralController' system = Param.System(Parent.any, "System 
this Engine is a part of") - req_port = RequestPort("Port to send updates to the outside") + + image_file = Param.String("Path to the vertex image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") - init_addr = Param.Addr("The addr for the initial update") - init_value = Param.Int("The value for the initial update") - image_file = Param.String("Path to the global memory image.") + cxx_exports = [PyBindMethod("createInitialBFSUpdate")] diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 2d65be2949..d80142b21e 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -45,3 +45,4 @@ class MPU(SimObject): "each instance of MPU object.") push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9bd1941b23..d9864664b1 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -59,14 +59,18 @@ void BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - // BaseMemoryEngine only supports one memory. - assert(memory_ranges.size() == 1); - - peerMemoryRange = memory_ranges.front(); - DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " - "The range is %s interleaved.\n", __func__, - peerMemoryRange.to_string(), - peerMemoryRange.interleaved() ? "" : "not"); + + if (memory_ranges.size() == 2) { + peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); + } else if (memory_ranges.size() == 1) { + peerMemoryRange = memory_ranges.front(); + } else { + panic("Received an unacceptable number of ranges from memory."); + } + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); } void diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6de1d8390..68b88e9e77 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -37,12 +37,9 @@ namespace gem5 { -CenteralController::CenteralController - (const CenteralControllerParams ¶ms): +CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system), - reqPort(name() + ".req_port", this), - maxVertexAddr(0) + system(params.system) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,33 +47,35 @@ CenteralController::CenteralController } } -Port& -CenteralController::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - void CenteralController::initState() { - // ClockedObject::initState(); - + for (auto mpu: mpuVector) { + addrRangeListMap[mpu] = mpu->getAddrRanges(); + } const auto& file = params().image_file; if (file == "") return; - auto *object = loader::createObjectFile(file, true); + auto* object = loader::createObjectFile(file, true); fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); - PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, - system->cacheLineSize()); + Addr maxVertexAddr = image.maxAddr(); + + PortProxy proxy( + [this](PacketPtr pkt) { + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(pkt->getAddr())) { + mpu->recvFunctional(pkt); + break; + } + } + } + }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); } @@ -84,21 +83,24 @@ CenteralController::initState() void CenteralController::startup() { - Addr 
initial_addr = params().init_addr; - uint32_t initial_value = params().init_value; - PacketPtr first_update = - createUpdatePacket(initial_addr, initial_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); + while(!initialUpdates.empty()) { + PacketPtr front = initialUpdates.front(); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(front->getAddr())) { + mpu->handleIncomingUpdate(front); + } + } + } + initialUpdates.pop_front(); } } template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared( - addr, sizeof(T), addr, value); + RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) value) << 2); @@ -106,65 +108,17 @@ CenteralController::createUpdatePacket(Addr addr, T value) PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} -PacketPtr -CenteralController::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC((Addr) 0); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); + pkt->setLE(value); return pkt; } void -CenteralController::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -CenteralController::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - } -} - -void -CenteralController::functionalAccess(PacketPtr pkt) +CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) { - DPRINTF(CenteralController, - "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - reqPort.sendFunctional(pkt); + PacketPtr update = createUpdatePacket(init_addr, init_value); + initialUpdates.push_back(update); } void @@ -176,19 +130,6 @@ CenteralController::recvDoneSignal() } if (done) { - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); - reqPort.sendFunctional(pkt); - - int num_items = system->cacheLineSize() / sizeof(WorkListItem); - WorkListItem items[num_items]; - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); - - for (int i = 0; i < num_items; i++) { - DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, addr, i, items[i].to_string()); - } - } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index bd272cf30d..4a4e9c7cb1 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "base/addr_range.hh" #include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" @@ -44,46 +45,24 @@ namespace gem5 
class CenteralController : public ClockedObject { private: - class ReqPort : public RequestPort - { - private: - CenteralController* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, CenteralController* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; - ReqPort reqPort; Addr maxVertexAddr; + std::deque initialUpdates; + std::vector mpuVector; + std::unordered_map addrRangeListMap; - template PacketPtr - createUpdatePacket(Addr addr, T value); - PacketPtr createReadPacket(Addr addr, unsigned int size); - void functionalAccess(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams &params); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - virtual void initState(); - virtual void startup(); + virtual void initState() override; + virtual void startup() override; + + void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62cae01613..ac62254fd6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,6 +127,15 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -136,6 +145,15 @@ int 
CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -147,7 +165,16 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); + // Addr upgraded_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(trimmed_addr)) { + // upgraded_addr = range.addIntlvBits(trimmed_addr); + // found = true; + // } + // } + // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 11fb1cd668..92e45365b4 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -48,6 +48,7 @@ #include "base/bitfield.hh" #include "base/cprintf.hh" +#include "base/intmath.hh" #include "base/logging.hh" #include "base/types.hh" @@ -747,6 +748,35 @@ class AddrRange return AddrRange(0, 0); } return AddrRange(start, end); + + friend AddrRange + merge(const AddrRange& left, const AddrRange& right) + { + assert(left.interleaved()); + assert(right.interleaved()); + assert(left.mergesWith(right)); + + int bits_org = left.masks.size(); + int bits_new = bits_org - 1; + + int left_match = left.intlvMatch; + int right_match = right.intlvMatch; + assert(std::abs(left_match - right_match) == (1 << bits_new)); + + Addr last_mask = left.masks[left.masks.size() - 1]; + int xor_high_bit_org = 0; + int xor_high_bit_new = 0; + if (!isPowerOf2(last_mask)) { + xor_high_bit_org = ceilLog2(last_mask); + xor_high_bit_new = 
xor_high_bit_org - 2; + } + int intlv_high_bit_org = + ceilLog2(last_mask ^ (1 << xor_high_bit_org)); + int intlv_high_bit_new = intlv_high_bit_org - 2; + + int match = std::min(left_match, right_match); + return AddrRange(left._start, left._end, intlv_high_bit_new, + xor_high_bit_new, bits_new, match); } }; From 3f002a4a33ba0f96c7e158d5f6fda8a005b13924 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 12:07:38 -0700 Subject: [PATCH 171/279] Adding stat to measure response latency. --- configs/accl/sega-simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 24 +++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index e0a4fcc89e..fffc273ee1 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -113,7 +113,7 @@ def __init__( ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '4GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ac62254fd6..43d352da30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -274,6 +274,7 @@ CoalesceEngine::recvWLRead(Addr addr) "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; return true; } else { // miss @@ -618,9 +619,16 @@ CoalesceEngine::processNextResponseEvent() DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" "responseQueue.size = %d.\n", __func__, responseQueue.size()); - if ((num_responses_sent >= maxRespPerCycle) || - (responseQueue.empty())) { - break; + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; } } @@ -1127,6 +1135,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1156,6 +1167,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), "Histogram of the latency of processing a memory function.") { @@ -1166,8 +1179,6 @@ CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); bitvectorSearchStatus.init(NUM_STATUS); bitvectorSearchStatus.subname(0, "PENDING_READ"); bitvectorSearchStatus.subname(1, "IN_CACHE"); @@ -1181,6 +1192,9 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 262f75fbcf..705285ba23 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; @@ -181,6 +182,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From 1b5fb0cfd1beb62978cd84b4ec3b3a3547f96d4d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 17:01:13 -0700 Subject: [PATCH 172/279] Adding stats to count model inaccuracies. 
--- src/accl/graph/sega/coalesce_engine.cc | 9 +++++++-- src/accl/graph/sega/coalesce_engine.hh | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 43d352da30..0a4a041176 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -828,6 +828,8 @@ CoalesceEngine::processNextApplyEvent() } DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; } applyQueue.pop_front(); @@ -966,7 +968,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidMemFunctions++; + stats.numInvalidWriteBacks++; } } @@ -1151,7 +1153,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 705285ba23..b1f5b1fea1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -172,7 +172,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidMemFunctions; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; statistics::Vector bitvectorSearchStatus; From dbcfad00c435e515265a109fe2ad3945f11a68ca Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 15:11:01 -0700 Subject: [PATCH 173/279] style fix. 
--- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index af1c904eda..6ff1f77c45 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -335,4 +335,4 @@ PushEngine::PushStats::regStats() edgeQueueLatency.init(64); } -} +} // namespace gem5 From 884117a9e591f001e12890a6bbb2ceb65203406c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 12:37:13 -0700 Subject: [PATCH 174/279] Adding multiple queues and ports in pushEngine --- src/accl/graph/base/data_structs.hh | 24 +++++++- src/accl/graph/sega/MPU.py | 8 ++- src/accl/graph/sega/mpu.cc | 90 ++++++++++++++++++++++++++++- src/accl/graph/sega/mpu.hh | 15 ++++- src/accl/graph/sega/push_engine.cc | 12 +++- src/accl/graph/sega/push_engine.hh | 2 +- 6 files changed, 137 insertions(+), 14 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index a46aaf2de9..d3db3edda5 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,7 +90,7 @@ struct __attribute__ ((packed)) Edge static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); -struct CompleteEdge { +struct MetaEdge { uint64_t src; uint64_t dst; uint32_t weight; @@ -98,17 +98,35 @@ struct CompleteEdge { uint64_t entrance; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() { - return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", src, dst, weight); } }; +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t 
src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index d80142b21e..1ea6a868a9 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject -class MPU(SimObject): +class MPU(ClockedObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" @@ -39,6 +39,8 @@ class MPU(SimObject): in_port = ResponsePort("Port to receive updates from outside") out_port = RequestPort("Port to send updates to the outside") + out_ports = VectorRequestPort("Ports to remote MPUs ") + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -46,3 +48,5 @@ class MPU(SimObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + update_queue_size = Param.Int(16, "Maximum number of entries " + "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 63aa474542..8897e5a959 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,23 +29,32 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "mem/packet_access.hh" #include "sim/sim_exit.hh" namespace gem5 { MPU::MPU(const Params& params): - SimObject(params), + ClockedObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this) + outPort(name() + ".outPort", this), + updateQueueSize(params.update_queue_size), 
+ nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); + + + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + + outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + } } Port& @@ -55,8 +64,10 @@ MPU::getPort(const std::string& if_name, PortID idx) return inPort; } else if (if_name == "out_port") { return outPort; + } else if (if_name == "outPorts") { + return outports[idx]; } else { - return SimObject::getPort(if_name, idx); + return ClockedObject::getPort(if_name, idx); } } @@ -166,6 +177,79 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +bool +MPU::enqueueUpdate(Update update) +{ + // Creating the packet + Addr dst_addr = update.dst; + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + + for (int i = 0; i < outports.size(); i++) { + AddrRangeList addrList = outports[i].getAddrRanges(); + for (auto range : addrList) { + if (range.contains(dst_addr)) { + if (updateQueues[i].size() < updateQueueSize) { + updateQueues[i].emplace_back(update, curTick()); + return true; + } else { + return false; + } + } + } + } + + panic("The update created does not match to any outport."); +} + +template PacketPtr +MPU::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +MPU::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[i].front(); + if 
(outports[i].blocked()) { + continue; + } + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outports[i].sendPacket(pkt); + updateQueues[i].pop_front(); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index edf0350caf..d7042540f0 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -36,7 +36,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "sim/sim_object.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -45,7 +45,7 @@ namespace gem5 class CenteralController; -class MPU : public SimObject +class MPU : public ClockedObject { private: class RespPort : public ResponsePort @@ -99,6 +99,16 @@ class MPU : public SimObject AddrRangeList localAddrRange; + uint32_t updateQueueSize; + + std::vector outports; + std::vector>> updateQueues; + + template PacketPtr createUpdatePacket(Addr addr, T value); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + public: PARAMS(MPU); MPU(const Params& params); @@ -115,6 +125,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6ff1f77c45..4546ceee47 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -224,7 +224,7 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = 
reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; @@ -255,8 +255,8 @@ PushEngine::processNextPushEvent() return; } - std::deque& edge_list = edgeQueue.front(); - CompleteEdge curr_edge = edge_list.front(); + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, curr_edge.to_string()); @@ -267,6 +267,12 @@ PushEngine::processNextPushEvent() curr_edge.dst, update_value); owner->sendPacket(update); + + Update update_2(curr_edge.src, curr_edge.dst, update_value); + if (!owner->enqueueUpdate(update_2)) { + // edge_list.pop_front(); + // edge_list.push_back(curr_edge); + } stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5d2277eb5a..d6763e3ab7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,7 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; - std::deque> edgeQueue; + std::deque> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From 51e94750f2f46c5cccb028a4810d4e57e7b70105 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 13:20:32 -0700 Subject: [PATCH 175/279] Changing propagate function --- src/accl/graph/sega/PushEngine.py | 7 ++- src/accl/graph/sega/push_engine.cc | 80 ++++++++++++------------ src/accl/graph/sega/push_engine.hh | 5 +- 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index ad9ddfefcf..7dba86aff2 --- 
a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -40,6 +40,9 @@ class PushEngine(BaseMemoryEngine): # significantly bigger than push_req_queue_size resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " - "edges read from memory") + "edges read from memory.") + + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " + "done per cycle.") - workload = Param.String("BFS", "Name of the workload") + workload = Param.String("BFS", "Name of the workload.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4546ceee47..c82a4c88be 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,10 +42,11 @@ PushEngine::PushEngine(const Params& params): lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), stats(*this) {} @@ -55,16 +56,6 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } -void -PushEngine::recvReqRetry() -{ - DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); - if (nextPushEvent.pending()) { - nextPushEvent.wake(); - schedule(nextPushEvent, nextCycle()); - } -} - bool PushEngine::vertexSpace() { @@ -238,57 +229,52 @@ PushEngine::handleMemResp(PacketPtr pkt) delete pkt_data; delete pkt; - if ((!nextPushEvent.pending()) && - (!nextPushEvent.scheduled())) { - schedule(nextPushEvent, nextCycle()); + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); } return true; } // TODO: Add a parameter to allow for 
doing multiple pushes at the same time. void -PushEngine::processNextPushEvent() +PushEngine::processNextPropagateEvent() { - if (owner->blocked()) { - stats.numNetBlocks++; - nextPushEvent.sleep(); - return; - } + int num_propagates = 0; + while(true) { + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - // TODO: Implement propagate function here - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - PacketPtr update = createUpdatePacket( - curr_edge.dst, update_value); - - owner->sendPacket(update); - - Update update_2(curr_edge.src, curr_edge.dst, update_value); - (!owner->enqueueUpdate(update_2)) { - // edge_list.pop_front(); - // edge_list.push_back(curr_edge); - } - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " - "with value: %d.\n", __func__, curr_edge.src, + Update update(curr_edge.src, curr_edge.dst, update_value); + edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " + "addr: %lu with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); + stats.numUpdates++; + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + } else { + edge_list.push_back(curr_edge); + } - stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); - edge_list.pop_front(); - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; + } + + if (edge_list.empty()) { + edgeQueue.pop_front(); + } } - 
assert(!nextPushEvent.pending()); - assert(!nextPushEvent.scheduled()); + assert(!nextPropagateEvent.scheduled()); if (!edgeQueue.empty()) { - schedule(nextPushEvent, nextCycle()); + schedule(nextPropagateEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index d6763e3ab7..f3304a8e2a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,6 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; + int maxPropagatesPerCycle; std::deque> edgeQueue; std::string workload; @@ -117,8 +118,8 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - MemoryEvent nextPushEvent; - void processNextPushEvent(); + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); struct PushStats : public statistics::Group { From e71353b1f4a64b6b2147926ba39706488328b6c3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 14:59:25 -0700 Subject: [PATCH 176/279] Pushing on Marjan's behalf, refactored out_port to vector-port. 
--- configs/accl/sega-single-simple.py | 6 +- configs/accl/sega-single.py | 4 +- src/accl/graph/base/data_structs.hh | 8 +-- src/accl/graph/sega/MPU.py | 3 +- src/accl/graph/sega/mpu.cc | 85 +++++++++++++++-------------- src/accl/graph/sega/mpu.hh | 20 ++++--- src/accl/graph/sega/push_engine.cc | 64 +++++++++------------- src/accl/graph/sega/push_engine.hh | 3 +- 8 files changed, 94 insertions(+), 99 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index a87e6c53bb..92c1c9cbcb 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -92,10 +92,10 @@ def getRespPort(self): def setRespPort(self, port): self.mpu.in_port = port - def getReqPort(self): - return self.mpu.out_port def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port + def getReqPort(self): + return self.mpu.out_ports def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py index d9fe11a781..e4f7942f42 100644 --- a/configs/accl/sega-single.py +++ b/configs/accl/sega-single.py @@ -92,9 +92,9 @@ def setRespPort(self, port): self.mpu.in_port = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d3db3edda5..34c8eb98ce 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,10 +96,10 @@ struct MetaEdge { uint32_t weight; uint32_t value; - uint64_t entrance; - - MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): - src(src), dst(dst), weight(weight), value(value), entrance(entrance) + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + 
MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 1ea6a868a9..aad2e060d1 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -37,9 +37,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") in_port = ResponsePort("Port to receive updates from outside") - out_port = RequestPort("Port to send updates to the outside") - out_ports = VectorRequestPort("Ports to remote MPUs ") + out_ports = VectorRequestPort("Outgoing ports to all MPUs") wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 8897e5a959..f86c7e02b7 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,7 +43,6 @@ MPU::MPU(const Params& params): coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this), updateQueueSize(params.update_queue_size), nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -52,8 +52,9 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - - outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + outPorts.emplace_back( + name() + ".outPorts" + std::to_string(i), this, i); + updateQueues.emplace_back(); } } @@ -62,10 +63,8 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_port") { return inPort; - } else if (if_name == "out_port") { - return outPort; - } else if (if_name == "outPorts") { - return outports[idx]; + } else if (if_name == 
"out_ports") { + return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -76,6 +75,9 @@ MPU::init() { localAddrRange = getAddrRanges(); inPort.sendRangeChange(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = getAddrRanges(); + } } void @@ -137,8 +139,6 @@ MPU::ReqPort::sendPacket(PacketPtr pkt) if (!sendTimingReq(pkt)) { blockedPacket = pkt; - } else { - owner->recvReqRetry(); } } @@ -157,6 +157,17 @@ MPU::ReqPort::recvReqRetry() PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +MPU::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } } bool @@ -180,28 +191,34 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) bool MPU::enqueueUpdate(Update update) { - // Creating the packet Addr dst_addr = update.dst; bool found_locally = false; + bool accepted = false; for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - - for (int i = 0; i < outports.size(); i++) { - AddrRangeList addrList = outports[i].getAddrRanges(); - for (auto range : addrList) { + DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", + __func__, outPorts.size(), updateQueues[0].size(), dst_addr); + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { if (range.contains(dst_addr)) { if (updateQueues[i].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d received an update.\n", + __func__, i); updateQueues[i].emplace_back(update, curTick()); - return true; - } else { - return false; + accepted = true; + break; } } } } - panic("The update created does not match to any outport."); + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; } template PacketPtr @@ -228,14 +245,19 @@ MPU::processNextUpdatePushEvent() 
int next_time_send = 0; for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); - if (outports[i].blocked()) { - continue; - } PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outports[i].sendPacket(pkt); + outPorts[i].sendPacket(pkt); + DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); updateQueues[i].pop_front(); if (updateQueues[i].size() > 0) { next_time_send += 1; @@ -256,25 +278,6 @@ MPU::recvVertexPush(Addr addr, WorkListItem wl) pushEngine->recvVertexPush(addr, wl); } -void -MPU::sendPacket(PacketPtr pkt) -{ - bool found_locally = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(pkt->getAddr()); - } - - if (found_locally) { - // TODO: count number of local updates - - } else { - // TOOD: count number of remote updates - - } - - outPort.sendPacket(pkt); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index d7042540f0..1a642e7873 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ +#include +#include + #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/push_engine.hh" @@ -74,13 +77,16 @@ class MPU : public ClockedObject private: MPU* owner; PacketPtr blockedPacket; + PortID _id; public: - ReqPort(const std::string& name, MPU* owner) : - RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + ReqPort(const std::string& name, MPU* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } + PortID id() 
{ return _id; } protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -95,15 +101,17 @@ class MPU : public ClockedObject PushEngine* pushEngine; RespPort inPort; - ReqPort outPort; AddrRangeList localAddrRange; uint32_t updateQueueSize; - std::vector outports; + std::unordered_map portAddrMap; + + std::vector outPorts; std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -133,9 +141,7 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - bool blocked() { return outPort.blocked(); } - void sendPacket(PacketPtr pkt); - void recvReqRetry() { pushEngine->recvReqRetry(); } + void recvReqRetry(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c82a4c88be..d533f1ea79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,15 +215,18 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back( - push_info.src, edge_dst, edge_weight, push_info.value, curTick()); + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + edges.emplace_back(meta_edge, curTick()); } + assert(!edges.empty()); edgeQueue.push_back(edges); + onTheFlyMemReqs--; reqInfoMap.erase(pkt->req); delete pkt_data; @@ -235,40 +238,44 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + std::deque>& edge_list = edgeQueue.front(); + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); - - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); + __func__, meta_edge.to_string()); - Update update(curr_edge.src, curr_edge.dst, update_value); + uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " - "addr: %lu with value: %d.\n", __func__, curr_edge.src, - curr_edge.dst, update_value); + DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - edge_list.push_back(curr_edge); + edge_list.emplace_back(meta_edge, entrance_tick); } - num_propagates++; - if (num_propagates >= maxPropagatesPerCycle) { + if (edge_list.empty()) { + edgeQueue.pop_front(); + } + + if (edgeQueue.empty()) { break; } - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; } } @@ -278,25 +285,6 @@ PushEngine::processNextPropagateEvent() } } -template PacketPtr -PushEngine::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared( - addr, sizeof(T), 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new 
Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index f3304a8e2a..fed6909733 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -103,11 +103,10 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque> edgeQueue; + std::deque>> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); - template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); bool workLeft(); From ddefa3a28b20344c81272825a1c4aa245d533c78 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 30 Sep 2022 08:37:23 -0700 Subject: [PATCH 177/279] Attempting to add multi-inports to MPU --- configs/accl/sega-single-simple.py | 4 +-- configs/accl/sega.py | 49 ++++++++++++------------------ src/accl/graph/sega/MPU.py | 5 +-- src/accl/graph/sega/mpu.cc | 37 +++++++++++++--------- src/accl/graph/sega/mpu.hh | 13 ++++---- src/accl/graph/sega/wl_engine.cc | 2 +- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index 92c1c9cbcb..eacb16d3d1 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -88,9 +88,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def setReqPort(self, port): self.mpu.out_ports = port diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a67551a5fd..455d081145 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -78,18 +78,19 @@ def __init__(self, edge_memory_size: str, 
cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine + push_engine=self.push_engine, + update_queue_size=16 ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range @@ -97,14 +98,7 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -112,19 +106,7 @@ def __init__( self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( AddrRange(start=0, size="4GiB"), @@ -137,13 +119,18 @@ def __init__( gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def 
create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -160,10 +147,12 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aad2e060d1..aea76db86f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -36,7 +36,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") - in_port = ResponsePort("Port to receive updates from outside") + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote outside") out_ports = VectorRequestPort("Outgoing ports to all MPUs") @@ -47,5 +48,5 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int(16, "Maximum number of entries " + update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f86c7e02b7..4a80b22979 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -42,7 +42,6 @@ MPU::MPU(const Params& params): wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), - inPort(name() + ".inPort", this), updateQueueSize(params.update_queue_size), 
nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -53,16 +52,21 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".outPorts" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); updateQueues.emplace_back(); } + + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } } Port& MPU::getPort(const std::string& if_name, PortID idx) { - if (if_name == "in_port") { - return inPort; + if (if_name == "in_ports") { + return inPorts[idx]; } else if (if_name == "out_ports") { return outPorts[idx]; } else { @@ -74,9 +78,11 @@ void MPU::init() { localAddrRange = getAddrRanges(); - inPort.sendRangeChange(); + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = getAddrRanges(); + portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); } } @@ -101,6 +107,14 @@ MPU::RespPort::checkRetryReq() } } +void +MPU::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool MPU::RespPort::recvTimingReq(PacketPtr pkt) { @@ -197,16 +211,13 @@ MPU::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", - __func__, outPorts.size(), updateQueues[0].size(), dst_addr); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { - if (updateQueues[i].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", - __func__, i); - updateQueues[i].emplace_back(update, curTick()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d 
received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); accepted = true; break; } @@ -268,8 +279,6 @@ MPU::processNextUpdatePushEvent() if (next_time_send > 0) { schedule(nextUpdatePushEvent, nextCycle()); } - - } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 1a642e7873..ff17eada0e 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -56,13 +56,16 @@ class MPU : public ClockedObject private: MPU* owner; bool needSendRetryReq; + PortID _id; public: - RespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + RespPort(const std::string& name, MPU* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; + PortID id() { return _id; } void checkRetryReq(); protected: @@ -100,18 +103,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - RespPort inPort; - AddrRangeList localAddrRange; uint32_t updateQueueSize; std::unordered_map portAddrMap; + std::vector inPorts; std::vector outPorts; std::vector>> updateQueues; - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -129,7 +130,6 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } bool handleIncomingUpdate(PacketPtr pkt); - void checkRetryReq() { inPort.checkRetryReq(); } void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -142,6 +142,7 @@ class MPU : public ClockedObject void recvVertexPush(Addr addr, WorkListItem wl); void recvReqRetry(); + void checkRetryReq(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 
5d4dd1723e..0267bd46b6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -74,7 +74,7 @@ WLEngine::reduce(uint32_t update, uint32_t value) bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - assert(updateQueue.size() <= updateQueueSize); + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } From da1584b90578fd06ae892aa4d2094863719e1821 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 09:06:36 -0700 Subject: [PATCH 178/279] Moving reqPorts from MPU to PushEngine --- configs/accl/sega.py | 10 +- src/accl/graph/sega/MPU.py | 4 - src/accl/graph/sega/PushEngine.py | 7 +- src/accl/graph/sega/mpu.cc | 136 +------------------------ src/accl/graph/sega/mpu.hh | 36 ------- src/accl/graph/sega/push_engine.cc | 154 ++++++++++++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 36 +++++++ 7 files changed, 200 insertions(+), 183 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 455d081145..21a041180f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,7 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -78,8 +79,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine, - update_queue_size=16 + push_engine=self.push_engine ) def getRespPort(self): @@ -88,9 +88,9 @@ def setRespPort(self, port): self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_ports + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_ports = port + self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aea76db86f..3547cb8817 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -39,8 +39,6 @@ class MPU(ClockedObject): in_ports = VectorResponsePort("Incoming Ports to receive updates from " "remote outside") - out_ports = VectorRequestPort("Outgoing ports to all MPUs") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -48,5 +46,3 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int("Maximum number of entries " - "for each update queue.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 7dba86aff2..5e0d2b3212 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,6 +34,8 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' + workload = Param.String("BFS", "Name of the workload.") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -45,4 +47,7 @@ class PushEngine(BaseMemoryEngine): max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") - workload = Param.String("BFS", "Name of the workload.") + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4a80b22979..76d7d3114f 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -41,21 +41,12 @@ MPU::MPU(const Params& params): system(params.system), wlEngine(params.wl_engine), 
coalesceEngine(params.coalesce_engine), - pushEngine(params.push_engine), - updateQueueSize(params.update_queue_size), - nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) + pushEngine(params.push_engine) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); - } - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { inPorts.emplace_back( name() + ".in_ports" + std::to_string(i), this, i); @@ -67,8 +58,6 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_ports") { return inPorts[idx]; - } else if (if_name == "out_ports") { - return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -77,13 +66,9 @@ MPU::getPort(const std::string& if_name, PortID idx) void MPU::init() { - localAddrRange = getAddrRanges(); for (int i = 0; i < inPorts.size(); i++){ inPorts[i].sendRangeChange(); } - for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); - } } void @@ -144,46 +129,6 @@ MPU::RespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -MPU::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(blockedPacket != nullptr, - "Should never try to send if blocked!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - } -} - -bool -MPU::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::ReqPort::recvReqRetry() -{ - panic_if(blockedPacket == nullptr, - "Received retry without a blockedPacket."); - - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); - if (blockedPacket == nullptr) { - owner->recvReqRetry(); - } -} - -void -MPU::recvReqRetry() -{ - if (!nextUpdatePushEvent.scheduled()) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { @@ -202,85 +147,6 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } -bool -MPU::enqueueUpdate(Update update) -{ - Addr dst_addr = update.dst; - bool found_locally = false; - bool accepted = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(dst_addr); - } - for (int i = 0; i < outPorts.size(); i++) { - AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", __func__, i); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - accepted = true; - break; - } - } - } - } - - if (accepted && (!nextUpdatePushEvent.scheduled())) { - schedule(nextUpdatePushEvent, nextCycle()); - } - - return accepted; -} - -template PacketPtr -MPU::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) 1) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - -void -MPU::processNextUpdatePushEvent() -{ - int next_time_send = 0; - - 
for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { - continue; - } - if (outPorts[i].blocked()) { - continue; - } - Update update; - Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); - PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outPorts[i].sendPacket(pkt); - DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); - if (updateQueues[i].size() > 0) { - next_time_send += 1; - } - } - - assert(!nextUpdatePushEvent.scheduled()); - if (next_time_send > 0) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ff17eada0e..4215f82d5b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,27 +75,6 @@ class MPU : public ClockedObject virtual void recvRespRetry(); }; - class ReqPort : public RequestPort - { - private: - MPU* owner; - PacketPtr blockedPacket; - PortID _id; - - public: - ReqPort(const std::string& name, MPU* owner, PortID id) : - RequestPort(name, owner), - owner(owner), blockedPacket(nullptr), _id(id) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return (blockedPacket != nullptr); } - PortID id() { return _id; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; CenteralController* centeralController; @@ -103,20 +82,7 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - AddrRangeList localAddrRange; - - uint32_t updateQueueSize; - - std::unordered_map portAddrMap; - std::vector inPorts; - std::vector outPorts; - std::vector>> updateQueues; - - template PacketPtr createUpdatePacket(Addr addr, T value); - - EventFunctionWrapper nextUpdatePushEvent; - void processNextUpdatePushEvent(); public: PARAMS(MPU); @@ 
-133,7 +99,6 @@ class MPU : public ClockedObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); - bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } @@ -141,7 +106,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void recvReqRetry(); void checkRetryReq(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d533f1ea79..70c10cc358 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -44,11 +44,40 @@ PushEngine::PushEngine(const Params& params): onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), + updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back( + name() + ".out_ports" + std::to_string(i), this, i); + updateQueues.emplace_back(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = 
outPorts[i].getAddrRanges(); + } +} void PushEngine::registerMPU(MPU* mpu) @@ -56,6 +85,46 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + bool PushEngine::vertexSpace() { @@ -255,7 +324,7 @@ PushEngine::processNextPropagateEvent() Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); - if (owner->enqueueUpdate(update)) { + if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; @@ -285,6 +354,87 @@ PushEngine::processNextPropagateEvent() } } +bool +PushEngine::enqueueUpdate(Update update) +{ + Addr dst_addr = update.dst; + bool found_locally = false; + bool accepted = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { + if (range.contains(dst_addr)) { + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, 
"%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + accepted = true; + break; + } + } + } + } + + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[i].front(); + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outPorts[i].sendPacket(pkt); + DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); + updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fed6909733..99fec33f2c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,6 +42,27 @@ class MPU; class PushEngine : public BaseMemoryEngine { private: + class ReqPort : public 
RequestPort + { + private: + PushEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, PushEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + PortID id() { return _id; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + class EdgeReadInfoGen { private: Addr _start; @@ -95,6 +116,8 @@ class PushEngine : public BaseMemoryEngine bool _running; Tick lastIdleEntranceTick; + AddrRangeList localAddrRange; + int numPendingPulls; int edgePointerQueueSize; std::deque edgePointerQueue; @@ -108,6 +131,13 @@ class PushEngine : public BaseMemoryEngine std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); + int updateQueueSize; + std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); + bool enqueueUpdate(Update update); + std::unordered_map portAddrMap; + std::vector outPorts; + bool vertexSpace(); bool workLeft(); @@ -120,6 +150,9 @@ class PushEngine : public BaseMemoryEngine EventFunctionWrapper nextPropagateEvent; void processNextPropagateEvent(); + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -147,6 +180,9 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } From 8829d55f6ea0e1139a7c5fd1ebc2b06b54c1abc3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 10:01:32 -0700 Subject: [PATCH 179/279] Moving respPorts from MPU to WLEngine --- configs/accl/sega.py | 4 +- 
src/accl/graph/sega/MPU.py | 7 +-- src/accl/graph/sega/WLEngine.py | 6 ++- src/accl/graph/sega/mpu.cc | 79 ++------------------------- src/accl/graph/sega/mpu.hh | 39 ++------------ src/accl/graph/sega/wl_engine.cc | 93 ++++++++++++++++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 34 ++++++++++++ 7 files changed, 140 insertions(+), 122 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 21a041180f..c6c2171315 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -83,9 +83,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_ports + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_ports = port + self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 3547cb8817..8d2453b01c 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,18 +27,15 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.SimObject import SimObject -class MPU(ClockedObject): +class MPU(SimObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" system = Param.System(Parent.any, "System this MPU is a part of") - in_ports = VectorResponsePort("Incoming Ports to receive updates from " - "remote outside") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a44352ab9b..91325ab53f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,11 +34,15 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote 
outside") + update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " "WLEngine has. It can service as " "many updates as this queueu has " - "entries at the same time.") # 4 is arbitrary + "entries at the same time.") workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 76d7d3114f..c8d0f636f2 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -37,7 +36,7 @@ namespace gem5 { MPU::MPU(const Params& params): - ClockedObject(params), + SimObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), @@ -46,30 +45,10 @@ MPU::MPU(const Params& params): wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { - inPorts.emplace_back( - name() + ".in_ports" + std::to_string(i), this, i); - } -} - -Port& -MPU::getPort(const std::string& if_name, PortID idx) -{ - if (if_name == "in_ports") { - return inPorts[idx]; - } else { - return ClockedObject::getPort(if_name, idx); - } } -void -MPU::init() -{ - for (int i = 0; i < inPorts.size(); i++){ - inPorts[i].sendRangeChange(); - } -} +MPU::~MPU() +{} void MPU::registerCenteralController(CenteralController* centeral_controller) @@ -77,58 +56,6 @@ MPU::registerCenteralController(CenteralController* centeral_controller) centeralController = centeral_controller; } -AddrRangeList -MPU::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -MPU::RespPort::checkRetryReq() -{ - if (needSendRetryReq) { - sendRetryReq(); - needSendRetryReq = false; - } -} 
- -void -MPU::checkRetryReq() -{ - for (int i = 0; i < inPorts.size(); ++i) { - inPorts[i].checkRetryReq(); - } -} - -bool -MPU::RespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; - return false; - } - - return true; -} - -Tick -MPU::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 4215f82d5b..a1e5055226 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -38,8 +38,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/packet.hh" -#include "mem/port.hh" -#include "sim/clocked_object.hh" +#include "sim/sim_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -48,33 +47,9 @@ namespace gem5 class CenteralController; -class MPU : public ClockedObject +class MPU : public SimObject { private: - class RespPort : public ResponsePort - { - private: - MPU* owner; - bool needSendRetryReq; - PortID _id; - - public: - RespPort(const std::string& name, MPU* owner, PortID id): - ResponsePort(name, owner), - owner(owner), needSendRetryReq(false), _id(id) - {} - virtual AddrRangeList getAddrRanges() const; - - PortID id() { return _id; } - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - System* system; CenteralController* centeralController; @@ -82,20 +57,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - std::vector inPorts; - public: PARAMS(MPU); MPU(const Params& params); - Port& getPort(const std::string& 
if_name, - PortID idx = InvalidPortID) override; - virtual void init() override; + ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } - bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -106,8 +77,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void checkRetryReq(); - void recvDoneSignal(); bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0267bd46b6..9a548a3255 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -45,7 +45,30 @@ WLEngine::WLEngine(const WLEngineParams& params): nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} void WLEngine::registerMPU(MPU* mpu) @@ -53,6 +76,70 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void 
+WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool WLEngine::done() { @@ -144,7 +231,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); vertexReadTime[update_addr] = curTick(); } } else { @@ -173,7 +260,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f888979be9..5f08678d26 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,8 +45,34 @@ class MPU; class WLEngine : public BaseReduceEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + bool needSendRetryReq; + PortID _id; + + public: + RespPort(const std::string& name, WLEngine* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + MPU* owner; + std::vector inPorts; + int updateQueueSize; std::deque> updateQueue; @@ -86,11 +112,19 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); WLEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + void checkRetryReq(); + bool done(); }; From e14d4ad6a54234e7bad4240c6b00f879e0016570 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 3 Oct 2022 12:58:25 -0700 Subject: [PATCH 180/279] Updating dprintfs. 
--- src/accl/graph/sega/push_engine.cc | 50 ++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70c10cc358..9039eb408d 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,10 +109,12 @@ PushEngine::ReqPort::recvReqRetry() panic_if(blockedPacket == nullptr, "Received retry without a blockedPacket."); + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); owner->recvReqRetry(); } } @@ -120,6 +122,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvReqRetry() { + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); if (!nextUpdatePushEvent.scheduled()) { schedule(nextUpdatePushEvent, nextCycle()); } @@ -325,7 +328,7 @@ PushEngine::processNextPropagateEvent() edge_list.pop_front(); if (enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( @@ -363,14 +366,17 @@ PushEngine::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } + DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), 
outPorts[i].id()); if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); accepted = true; break; } @@ -408,23 +414,47 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { + // for (int i = 0; i < updateQueues.size(); i++) { + // if (updateQueues[i].empty()) { + // continue; + // } + // if (outPorts[i].blocked()) { + // continue; + // } + // Update update; + // Tick entrance_tick; + // std::tie(update, entrance_tick) = updateQueues[i].front(); + // PacketPtr pkt = createUpdatePacket(update.dst, update.value); + // outPorts[i].sendPacket(pkt); + // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + // "%d.\n", __func__, update.src, update.dst, update.value); + // updateQueues[i].pop_front(); + // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + // if (updateQueues[i].size() > 0) { + // next_time_send += 1; + // } + // } + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); continue; } - if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + if (updateQueues[outPorts[i].id()].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, 
outPorts[i].id()); continue; } + DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - if (updateQueues[i].size() > 0) { + if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } } From 8c0146a9024ffb6651e5da324e867c99683f1d0b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 4 Oct 2022 12:49:29 -0700 Subject: [PATCH 181/279] Fixing the problems with retry --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/push_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c6c2171315..6b198c5f4a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,8 +48,8 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=2, + register_file_size=2 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=2 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9039eb408d..238b8a89fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -54,7 +54,6 @@ PushEngine::PushEngine(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); } } @@ -93,6 +92,7 @@ PushEngine::ReqPort::sendPacket(PacketPtr pkt) // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); blockedPacket = pkt; } } @@ -386,7 +386,7 @@ PushEngine::enqueueUpdate(Update update) if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -448,10 +448,10 @@ PushEngine::processNextUpdatePushEvent() DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); + std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 99fec33f2c..4e0cdbc526 100644 --- 
a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -132,10 +132,11 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - std::vector>> updateQueues; + // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; + std::unordered_map>> updateQueues; std::vector outPorts; bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9a548a3255..116cdf3f77 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -98,8 +98,8 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - sendRetryReq(); needSendRetryReq = false; + sendRetryReq(); } } From 020ebf7c3652d6b78fee5cb4e458cd73a533d937 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Oct 2022 14:10:57 -0700 Subject: [PATCH 182/279] Fixing done, code style and config. Adding a stat. 
--- configs/accl/sega-simple.py | 68 ++++++------- configs/accl/sega-single-simple.py | 151 ---------------------------- configs/accl/sega-single.py | 155 ----------------------------- configs/accl/sega.py | 14 +-- src/accl/graph/sega/mpu.cc | 3 - src/accl/graph/sega/mpu.hh | 1 - src/accl/graph/sega/push_engine.cc | 97 ++++++++++-------- src/accl/graph/sega/push_engine.hh | 4 +- 8 files changed, 90 insertions(+), 403 deletions(-) delete mode 100644 configs/accl/sega-single-simple.py delete mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index fffc273ee1..54a90281bf 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -48,20 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16, ) self.vertex_mem_ctrl = SimpleMemory( @@ -88,14 +89,14 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.wl_engine.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range @@ -103,54 +104,39 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image class 
SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '4GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -167,11 +153,13 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) + exit_event 
= m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py deleted file mode 100644 index eacb16d3d1..0000000000 --- a/configs/accl/sega-single-simple.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_ports - def setRespPort(self, port): - self.mpu.in_ports = port - - def setReqPort(self, port): - self.mpu.out_ports = port - def getReqPort(self): - return self.mpu.out_ports - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_vertex_image(self, vertex_image): - self.vertex_mem_ctrl.image_file = vertex_image - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, 
graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_range(AddrRange("4GiB")) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py deleted file mode 100644 index e4f7942f42..0000000000 --- a/configs/accl/sega-single.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GiB/s" - ) - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_port - def setRespPort(self, port): - self.mpu.in_port = port - - def getReqPort(self): - return self.mpu.out_ports - def setReqPort(self, port): - self.mpu.out_ports = port - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - 
self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - plain_vertex_range = AddrRange("4GiB") - self._vertex_ranges = interleave_addresses( - plain_vertex_range, - 1, - 32 - ) - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_ranges(self._vertex_ranges[0]) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 6b198c5f4a..fab414f2c5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,21 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=2, - register_file_size=2 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, 
+ num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=2 + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -101,7 +101,7 @@ class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c8d0f636f2..44054d1efb 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -47,9 +47,6 @@ MPU::MPU(const Params& params): pushEngine->registerMPU(this); } -MPU::~MPU() -{} - void MPU::registerCenteralController(CenteralController* centeral_controller) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index a1e5055226..229bd28950 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -60,7 +60,6 @@ class MPU : public SimObject public: PARAMS(MPU); MPU(const Params& params); - ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 238b8a89fb..5835b61fc6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -53,7 +53,7 @@ PushEngine::PushEngine(const Params& params): { for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); } } @@ -144,9 +144,12 @@ PushEngine::workLeft() bool PushEngine::done() { - return edgeQueue.empty() && - (onTheFlyMemReqs == 0) && - 
edgePointerQueue.empty(); + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + } + return empty_update_queues && edgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -357,6 +360,16 @@ PushEngine::processNextPropagateEvent() } } +bool +contains(AddrRangeList range_list, Addr addr) +{ + bool found = false; + for (auto range: range_list) { + found |= range.contains(addr); + } + return found; +} + bool PushEngine::enqueueUpdate(Update update) { @@ -369,24 +382,32 @@ PushEngine::enqueueUpdate(Update update) DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); - DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. 
Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); - accepted = true; - break; - } + if (contains(addr_range_list, dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", + __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", __func__, + updateQueues[outPorts[i].id()].size(), + outPorts[i].id()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue %d.\n", __func__, outPorts[i].id()); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back " + "of queue for port %d is. Size of queue " + "for port %d is %d.\n", __func__, + outPorts[i].id(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + accepted = true; + stats.updateQueueLength.sample( + updateQueues[outPorts[i].id()].size()); } } } if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -414,46 +435,31 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - // for (int i = 0; i < updateQueues.size(); i++) { - // if (updateQueues[i].empty()) { - // continue; - // } - // if (outPorts[i].blocked()) { - // continue; - // } - // Update update; - // Tick entrance_tick; - // std::tie(update, entrance_tick) = updateQueues[i].front(); - // PacketPtr pkt = createUpdatePacket(update.dst, update.value); - // outPorts[i].sendPacket(pkt); - // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - // "%d.\n", __func__, update.src, update.dst, update.value); - // updateQueues[i].pop_front(); - // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - // if (updateQueues[i].size() > 0) { - // next_time_send += 1; - // } - // } - 
for (int i = 0; i < outPorts.size(); i++) { if (outPorts[i].blocked()) { - DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d blocked.\n", + __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d available.\n", + __func__, outPorts[i].id()); if (updateQueues[outPorts[i].id()].empty()) { - DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d is empty.\n", __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" + "Respective queue size is %d.\n", __func__, + update.to_string(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } @@ -480,7 +486,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue.") + "Histogram of the latency of the edgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues.") { } @@ -493,6 +501,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); + updateQueueLength.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4e0cdbc526..fbe527bcb6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -51,7 +51,7 @@ class PushEngine : public BaseMemoryEngine public: ReqPort(const std::string& name, PushEngine* owner, PortID id) : - RequestPort(name, owner), + RequestPort(name, owner), owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); @@ -132,7 +132,6 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; @@ -170,6 +169,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; + statistics::Histogram updateQueueLength; }; PushStats stats; From 
f2735feb55ef5605e7bd76de01ae97bf5376c040 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Oct 2022 15:35:54 -0700 Subject: [PATCH 183/279] Back indent. --- configs/accl/sega-simple.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 54a90281bf..93267f0f24 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -31,18 +31,18 @@ from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): From 91790268ecb76a19e02086317ca0e62d052bf40e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 7 Oct 2022 10:27:22 -0700 Subject: [PATCH 184/279] Fixed HBM range issue. 
--- configs/accl/sega-hbm.py | 163 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.cc | 11 +- src/accl/graph/sega/coalesce_engine.cc | 27 ---- src/base/addr_range.hh | 44 +++--- src/mem/HBMCtrl.py | 2 + src/mem/hbm_ctrl.cc | 10 +- src/mem/hbm_ctrl.hh | 3 +- 7 files changed, 202 insertions(+), 58 deletions(-) create mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py new file mode 100644 index 0000000000..da7d79d7fe --- /dev/null +++ b/configs/accl/sega-hbm.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=128, + register_file_size=64 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64, + update_queue_size=16 + ) + + self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64()) + + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return 
self.wl_engine.in_ports + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + def setReqPort(self, port): + self.push_engine.out_ports = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, num_mpus, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) + gpt.set_vertex_pch_bit(8) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if 
__name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index d9864664b1..9f704f71e9 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -60,13 +60,10 @@ BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - if (memory_ranges.size() == 2) { - peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); - } else if (memory_ranges.size() == 1) { - peerMemoryRange = memory_ranges.front(); - } else { - panic("Received an unacceptable number of ranges from memory."); - } + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " "%s. 
The range is %s interleaved.\n", __func__, peerMemoryRange.to_string(), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0a4a041176..f4cd6a950d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,15 +127,6 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -145,15 +136,6 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -165,16 +147,7 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); - // Addr upgraded_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(trimmed_addr)) { - // upgraded_addr = range.addIntlvBits(trimmed_addr); - // found = true; - // } - // } - // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 92e45365b4..be2e96c200 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -750,33 +750,37 @@ class AddrRange return AddrRange(start, end); 
friend AddrRange - merge(const AddrRange& left, const AddrRange& right) + mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit) { assert(left.interleaved()); assert(right.interleaved()); assert(left.mergesWith(right)); - int bits_org = left.masks.size(); - int bits_new = bits_org - 1; - - int left_match = left.intlvMatch; - int right_match = right.intlvMatch; - assert(std::abs(left_match - right_match) == (1 << bits_new)); - - Addr last_mask = left.masks[left.masks.size() - 1]; - int xor_high_bit_org = 0; - int xor_high_bit_new = 0; - if (!isPowerOf2(last_mask)) { - xor_high_bit_org = ceilLog2(last_mask); - xor_high_bit_new = xor_high_bit_org - 2; + uint8_t old_left_match = left.intlvMatch; + uint8_t new_left_match = 0; + uint8_t old_right_match = right.intlvMatch; + uint8_t new_right_match = 0; + int new_bits = left.masks.size() - 1; + + // assumption: masks is sorted in ascending order + std::vector new_masks; + for (auto mask: left.masks) { + uint64_t lsb_mask = (mask ^ (mask - 1)) + 1; + if ((lsb_mask >> 1) != (1 << pch_bit)) { + new_masks.push_back(mask); + new_left_match |= ((old_left_match & 1) << new_bits); + new_left_match >>= 1; + new_right_match |= ((old_right_match & 1) << new_bits); + new_right_match >>= 1; + } + old_left_match >>= 1; + old_right_match >>= 1; } - int intlv_high_bit_org = - ceilLog2(last_mask ^ (1 << xor_high_bit_org)); - int intlv_high_bit_new = intlv_high_bit_org - 2; + panic_if(new_left_match != new_right_match, + "The two ranges can not be a pseudo channel pair " + "given the pseudochannel bit position of params.pch_bit."); - int match = std::min(left_match, right_match); - return AddrRange(left._start, left._end, intlv_high_bit_new, - xor_high_bit_new, bits_new, match); + return AddrRange(left._start, left._end, new_masks, new_left_match); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 45d89a76c9..f32ffe6f0a 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class 
HBMCtrl(MemCtrl): # HBMCtrl has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index f87fa2dcbb..62a3254364 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,6 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, @@ -226,7 +227,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -487,8 +488,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index b17caa6b49..348152bf31 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. 
*/ From 2eb773215301a174844da0b68151f2f320e4dc00 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 11:49:25 -0700 Subject: [PATCH 185/279] Refactoring reading edges from memory --- src/accl/graph/sega/push_engine.cc | 41 +++++++++++++----------------- src/accl/graph/sega/push_engine.hh | 10 ++++++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5835b61fc6..7265cec1a4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -148,7 +148,7 @@ PushEngine::done() for (int i = 0; i < outPorts.size(); i++) { empty_update_queues &= updateQueues[outPorts[i].id()].empty(); } - return empty_update_queues && edgeQueue.empty() && + return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -230,13 +230,13 @@ PushEngine::processNextMemoryReadEvent() nextMemoryReadEvent.sleep(); return; } + Addr aligned_addr, offset; + int num_edges; - if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { - Addr aligned_addr, offset; - int num_edges; - - EdgeReadInfoGen &curr_info = edgePointerQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -246,8 +246,9 @@ PushEngine::processNextMemoryReadEvent() reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); - onTheFlyMemReqs++; + onTheFlyMemReqs += num_edges; + curr_info.iterate(); if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( @@ -290,19 +291,16 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); - edges.emplace_back(meta_edge, curTick()); + metaEdgeQueue.emplace_back(meta_edge, curTick()); } - assert(!edges.empty()); - edgeQueue.push_back(edges); - onTheFlyMemReqs--; + onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); delete pkt_data; delete pkt; @@ -318,17 +316,16 @@ PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque>& edge_list = edgeQueue.front(); MetaEdge meta_edge; Tick entrance_tick; - std::tie(meta_edge, entrance_tick) = edge_list.front(); + std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); - edge_list.pop_front(); + metaEdgeQueue.pop_front(); if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", @@ -337,14 +334,10 @@ PushEngine::processNextPropagateEvent() stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - 
edge_list.emplace_back(meta_edge, entrance_tick); - } - - if (edge_list.empty()) { - edgeQueue.pop_front(); + metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } - if (edgeQueue.empty()) { + if (metaEdgeQueue.empty()) { break; } @@ -355,7 +348,7 @@ PushEngine::processNextPropagateEvent() } assert(!nextPropagateEvent.scheduled()); - if (!edgeQueue.empty()) { + if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); } } @@ -486,7 +479,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue."), + "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fbe527bcb6..cc087aff11 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -93,11 +93,17 @@ class PushEngine : public BaseMemoryEngine } else { num_items = (_end - _start) / _step; } - _start = aligned_addr + _atom; return std::make_tuple(aligned_addr, offset, num_items); } + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + bool done() { return (_start >= _end); } Addr src() { return _src; } @@ -126,7 +132,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque>> edgeQueue; + std::deque> metaEdgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From fdc455a6896d4b161e358e014a13650083506684 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 13:33:25 -0700 Subject: [PATCH 186/279] Added statistics to calculate 
number of propagates sent --- src/accl/graph/sega/push_engine.cc | 10 +++++++--- src/accl/graph/sega/push_engine.hh | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 7265cec1a4..4b3277d3e1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -336,17 +336,18 @@ PushEngine::processNextPropagateEvent() } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } + num_propagates++; if (metaEdgeQueue.empty()) { break; } - - num_propagates++; if (num_propagates >= maxPropagatesPerCycle) { break; } } + stats.numPropagates.sample(num_propagates); + assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); @@ -481,7 +482,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), - "Histogram of the length of updateQueues.") + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Histogram of number of propagates sent.") { } @@ -495,6 +498,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); + numPropagates.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cc087aff11..c078391420 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -176,6 +176,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; + statistics::Histogram numPropagates; }; PushStats stats; From 876670ca55542376bb0d2aad0282cd91c5322ee9 Mon Sep 17 00:00:00 2001 
From: Marjan Fariborz Date: Sat, 8 Oct 2022 16:25:41 -0700 Subject: [PATCH 187/279] Adding coalescing to pushEngine --- src/accl/graph/sega/push_engine.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4b3277d3e1..79e5344395 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -368,6 +368,7 @@ bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; + bool fount_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -383,7 +384,26 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + for (auto itr = updateQueues[outPorts[i].id()].begin(); + itr != updateQueues[outPorts[i].id()].end(); + itr++){ + std::tuple curr_update = *itr; + if (std::get<0>(curr_update).dst == update.dst){ + uint32_t value = + std::min(std::get<0>(curr_update).value, update.value); + DPRINTF(PushEngine, "%s: found a coalescing opportunity " + "for destination %d new value: %d by comparing %d " + "and %d. 
\n", __func__, update.dst, value, + std::get<0>(curr_update).value, update.value); + fount_coalescing = true; + update.value = value; + updateQueues[outPorts[i].id()].erase(itr); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + break; + } + } + if ((fount_coalescing == false) && + (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); @@ -398,6 +418,7 @@ PushEngine::enqueueUpdate(Update update) } } } + fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); From cae9309500c8877b6b83a9b8dcebac4ed6014933 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 8 Oct 2022 19:49:58 -0700 Subject: [PATCH 188/279] Adding function to print final answer. --- configs/accl/sega-hbm.py | 18 +++-- configs/accl/sega-simple.py | 2 +- configs/accl/sega.py | 2 +- src/accl/graph/sega/CenteralController.py | 5 +- src/accl/graph/sega/centeral_controller.cc | 44 +++++++++++- src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 80 ++++++++++++---------- src/accl/graph/sega/push_engine.hh | 9 ++- src/base/addr_range.hh | 10 +++ 9 files changed, 125 insertions(+), 48 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index da7d79d7fe..70aac6c2cb 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=16 + resp_queue_size=512, + update_queue_size=32 ) self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), @@ -136,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): 
self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -143,14 +146,19 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument("--verify", type=bool, help="Print final answer") args = argparser.parse_args() + verify = False + if not args.verify is None: + verify = args.verify + return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value + args.graph, args.init_addr, args.init_value, verify if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) @@ -161,3 +169,5 @@ def get_inputs(): exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 93267f0f24..7ec19c92ae 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16, + update_queue_size=32, ) self.vertex_mem_ctrl = SimpleMemory( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index fab414f2c5..c50c525297 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=32 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0721ff977c..2ba53c231f 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -41,4 +41,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - cxx_exports = [PyBindMethod("createInitialBFSUpdate")] + cxx_exports = [ + PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 68b88e9e77..7c89c1edea 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + +#include "base/cprintf.hh" #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -62,7 +65,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - Addr maxVertexAddr = image.maxAddr(); + maxVertexAddr = image.maxAddr(); PortProxy proxy( [this](PacketPtr pkt) { @@ -97,6 +100,21 @@ CenteralController::startup() } } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 0) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { @@ -134,4 +152,28 @@ CenteralController::recvDoneSignal() } } +void +CenteralController::printAnswerToHostSimout() +{ + int num_items = system->cacheLineSize() / sizeof(WorkListItem); + 
WorkListItem items[num_items]; + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, addr)) { + mpu->recvFunctional(pkt); + } + } + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + for (int i = 0; i < num_items; i++) { + std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + addr, i, items[i].to_string()); + + std::cout << print << std::endl; + } + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4a4e9c7cb1..d006851e3b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -53,6 +53,7 @@ class CenteralController : public ClockedObject std::vector mpuVector; std::unordered_map addrRangeListMap; + PacketPtr createReadPacket(Addr addr, unsigned int size); template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -64,6 +65,8 @@ class CenteralController : public ClockedObject void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); + + void printAnswerToHostSimout(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 79e5344395..d5fb002f82 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,7 +43,6 @@ PushEngine::PushEngine(const Params& params): numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), - workload(params.workload), updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), @@ -152,10 +151,23 @@ PushEngine::done() (onTheFlyMemReqs == 
0) && edgePointerQueue.empty(); } +uint32_t +PushEngine::reduce(uint32_t update, uint32_t value) +{ + std::string workload = params().workload; + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} uint32_t PushEngine::propagate(uint32_t value, uint32_t weight) { + std::string workload = params().workload; uint32_t update; if (workload == "BFS") { update = value + 1; @@ -235,7 +247,7 @@ PushEngine::processNextMemoryReadEvent() EdgeReadInfoGen& curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " @@ -299,6 +311,8 @@ PushEngine::handleMemResp(PacketPtr pkt) push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); } + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); @@ -330,7 +344,7 @@ PushEngine::processNextPropagateEvent() if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); - stats.numUpdates++; + stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { @@ -346,7 +360,7 @@ PushEngine::processNextPropagateEvent() } } - stats.numPropagates.sample(num_propagates); + stats.numPropagatesHist.sample(num_propagates); assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { @@ -354,21 +368,11 @@ PushEngine::processNextPropagateEvent() } } -bool -contains(AddrRangeList range_list, Addr addr) -{ - bool found = false; - for (auto range: 
range_list) { - found |= range.contains(addr); - } - return found; -} - bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; - bool fount_coalescing = false; + bool found_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -384,25 +388,21 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - for (auto itr = updateQueues[outPorts[i].id()].begin(); - itr != updateQueues[outPorts[i].id()].end(); - itr++){ - std::tuple curr_update = *itr; - if (std::get<0>(curr_update).dst == update.dst){ - uint32_t value = - std::min(std::get<0>(curr_update).value, update.value); + for (auto& entry: updateQueues[outPorts[i].id()]) { + Update& curr_update = std::get<0>(entry); + if (curr_update.dst == update.dst) { + uint32_t old_value = curr_update.value; + curr_update.value = reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " - "for destination %d new value: %d by comparing %d " - "and %d. \n", __func__, update.dst, value, - std::get<0>(curr_update).value, update.value); - fount_coalescing = true; - update.value = value; - updateQueues[outPorts[i].id()].erase(itr); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - break; + "for destination %d with new value: %d by " + "coalescing %d and %d. 
\n", __func__, update.dst, + curr_update.value, old_value, update.value); + found_coalescing = true; + accepted = true; + stats.updateQueueCoalescions++; } } - if ((fount_coalescing == false) && + if ((found_coalescing == false) && (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); @@ -418,7 +418,6 @@ PushEngine::enqueueUpdate(Update update) } } } - fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); @@ -478,6 +477,7 @@ PushEngine::processNextUpdatePushEvent() if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } + stats.numUpdates++; } assert(!nextUpdatePushEvent.scheduled()); @@ -489,12 +489,18 @@ PushEngine::processNextUpdatePushEvent() PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), ADD_STAT(numIdleCycles, statistics::units::Count::get(), "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second."), @@ -504,7 +510,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), - 
ADD_STAT(numPropagates, statistics::units::Count::get(), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), "Histogram of number of propagates sent.") { } @@ -514,12 +520,12 @@ PushEngine::PushStats::regStats() { using namespace statistics; - TEPS = numUpdates / simSeconds; + TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); - numPropagates.init(push.params().max_propagates_per_cycle); + numPropagatesHist.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c078391420..6163ba5c27 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -134,7 +134,7 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; @@ -167,16 +167,19 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; - statistics::Scalar numUpdates; + statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; statistics::Scalar numIdleCycles; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; - statistics::Histogram numPropagates; + statistics::Histogram numPropagatesHist; }; PushStats stats; diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index be2e96c200..526db62606 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -867,6 +867,16 @@ RangeSize(Addr start, Addr size) return AddrRange(start, start + size); } +inline bool +contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + 
ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ From 714f5c3cffff70940282e2ab0bebc0d88a3dbc4c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 9 Oct 2022 17:15:04 -0700 Subject: [PATCH 189/279] Typos. --- configs/accl/real-graph-gen.py | 74 +++++++++++++++++++ configs/accl/sega-hbm.py | 14 ++-- .../accl/{graph-gen.py => synth-graph-gen.py} | 0 src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 12 +-- src/accl/graph/sega/wl_engine.hh | 2 +- 6 files changed, 89 insertions(+), 15 deletions(-) create mode 100644 configs/accl/real-graph-gen.py rename configs/accl/{graph-gen.py => synth-graph-gen.py} (100%) diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py new file mode 100644 index 0000000000..db44c63a9a --- /dev/null +++ b/configs/accl/real-graph-gen.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("path", type=str, help="Path to the graph file.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.path, args.num_gpts + +if __name__ == "__main__": + graph_path, num_gpts = get_inputs() + + graph_reader = os.environ.get("GRAPH_READER") + + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + + if not os.path.exists(graph_path): + raise ValueError(f"{graph_path} does not exist.") + + graph_dir = os.path.dirname(graph_path) + if not "binaries" in os.listdir(graph_dir): + print(f"binaries directory not found in {graph_dir}") + os.mkdir(f"{graph_dir}/binaries") + print(f"Created {graph_dir}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") + os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): + 
os.remove(delete.path) + print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 70aac6c2cb..cdc752f2bd 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -42,7 +42,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): xorHighBit=0, intlvBits=intlv_bits, intlvMatch=i)) - return ret + return ret, intlv_low_bit + intlv_bits - 1 class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): @@ -112,17 +112,17 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("2GiB", cache_size) gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(8) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus diff --git a/configs/accl/graph-gen.py b/configs/accl/synth-graph-gen.py similarity index 100% rename from configs/accl/graph-gen.py rename to configs/accl/synth-graph-gen.py diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7c89c1edea..82e63d512e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -168,7 +168,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - 
std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, items[i].to_string()); std::cout << print << std::endl; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 116cdf3f77..eb2006a3df 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -76,16 +76,16 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } -AddrRangeList +AddrRangeList WLEngine::getAddrRanges() -{ - return owner->getAddrRanges(); +{ + return owner->getAddrRanges(); } -void +void WLEngine::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); +{ + owner->recvFunctional(pkt); } AddrRangeList diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5f08678d26..7578044cbf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -54,7 +54,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner, PortID id): - ResponsePort(name, owner), + ResponsePort(name, owner), owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; From 86051598667bd63a0ffc2f999108c22666068e1c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 11 Oct 2022 15:07:29 -0700 Subject: [PATCH 190/279] Adding functions to move value to and from float. 
--- src/accl/graph/base/data_structs.hh | 24 +++++++++++++++++++++++- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 11 ++++------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 34c8eb98ce..3753e10d62 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -33,6 +33,8 @@ #include "base/intmath.hh" #include +#include +#include namespace gem5 { @@ -96,7 +98,7 @@ struct MetaEdge { uint32_t weight; uint32_t value; - MetaEdge(): src(0), dst(0), weight(0), value(0) + MetaEdge(): src(0), dst(0), weight(0), value(0) {} MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): src(src), dst(dst), weight(weight), value(value) @@ -176,6 +178,26 @@ class UniqueFIFO } }; +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5fb002f82..cd795eaf00 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -220,10 +220,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back( - start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, - (uint32_t) wl.prop, curTick()); + EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -245,7 +244,8 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { @@ -264,8 +264,7 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( - (curTick() - curr_info.entrance()) * - 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6163ba5c27..acf012b24d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -73,12 +73,11 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; - Tick _entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value, Tick entrance): - _start(start), _end(end), _step(step), _atom(atom), - _src(src), _value(value), _entrance(entrance) + size_t atom, Addr src, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -108,8 +107,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } - - Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -126,7 +123,7 @@ class PushEngine : public BaseMemoryEngine int numPendingPulls; int 
edgePointerQueueSize; - std::deque edgePointerQueue; + std::deque> edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; From b33e95109cba41a5e4ac268ef28a72e957be59b3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 15:54:40 -0700 Subject: [PATCH 191/279] Adding sssp and pr. --- src/accl/graph/sega/CoalesceEngine.py | 2 ++ src/accl/graph/sega/PushEngine.py | 3 ++ src/accl/graph/sega/coalesce_engine.cc | 29 ++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++++++++---- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 8 +++++- 7 files changed, 63 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index f6e997f1e3..eeba279b7a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -44,3 +44,5 @@ class CoalesceEngine(BaseMemoryEngine): "requestor in each cycle. 
Used to limit b/w.") workload = Param.String("BFS", "Name of the workload") + + thereshold = Param.Float('0.0001', "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 5e0d2b3212..52dc0e2506 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -51,3 +51,6 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") + + alpha = Param.Float(0.8, "This parameter is specific to pagerank") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f4cd6a950d..91072a1da8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), workload(params.workload), + _workCount(0), numPullsReceived(0), + workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -110,16 +111,20 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -uint32_t -CoalesceEngine::reduce(uint32_t update, uint32_t value) +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - uint32_t new_value; if(workload == "BFS"){ - new_value = std::min(update, value); + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); } else{ - panic("Workload not implemented\n"); + panic("The workload is not recognize"); } - return new_value; } // addr should be aligned to peerMemoryAtomSize @@ -639,7 +644,8 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + if (applyCondition( + wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].needsApply |= true; // NOTE: We don't set needsWB and rely on processNextApplyEvent to @@ -747,12 +753,7 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - // NOTE: It might be the case that for workloads other than BFS, - // the reduce function here should be different to the reduce - // function defined in WLEngine. Think about the case of PR in - // detail. - uint32_t new_prop = reduce( - cacheBlocks[block_index].items[index].tempProp, current_prop); + uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b1f5b1fea1..a087f37b4d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -131,6 +131,7 @@ class CoalesceEngine : public BaseMemoryEngine std::string workload; uint32_t reduce(uint32_t update, uint32_t value); + bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cd795eaf00..c9efa03f08 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,6 +158,10 @@ PushEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; 
if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + new_value = update + value; + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ panic("Workload not implemented\n"); } @@ -165,19 +169,42 @@ PushEngine::reduce(uint32_t update, uint32_t value) } uint32_t -PushEngine::propagate(uint32_t value, uint32_t weight) +PushEngine::propagate(uint32_t delta, uint32_t weight) { std::string workload = params().workload; uint32_t update; if (workload == "BFS") { - update = value + 1; - } - else{ + update = delta + 1; + } else if (workload == "SSSP") { + update = delta + weight; + } else if (workload == "PR") { + float float_form = writeToFloat(delta); + float float_update = float_form * weight * params().alpha; + update = readFromFloat(float_update); + } else{ panic("The workload %s is not supported", workload); } return update; } +uint32_t +PushEngine::calculateValue(WorkListItem wl) +{ + std::string workload = params().workload; + uint32_t delta; + if (workload == "PR") { + float property = writeToFloat(wl.prop) / wl.degree; + delta = readFromFloat(property); + } else if (workload == "BFS") { + delta = wl.prop; + } else if (workload == "SSSP") { + delta = wl.prop; + } else { + panic("Workload not supported."); + } + return delta; +} + void PushEngine::start() { @@ -220,9 +247,11 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + peerMemoryAtomSize, addr, value); edgePointerQueue.emplace_back(info_gen, curTick()); + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -256,7 +285,6 @@ PushEngine::processNextMemoryReadEvent() PacketPtr pkt = 
createReadPacket(aligned_addr, peerMemoryAtomSize); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; - memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index acf012b24d..c03e78851c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,6 +133,7 @@ class PushEngine : public BaseMemoryEngine uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); + uint32_t calculateValue(WorkListItem wl); int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index eb2006a3df..f684650f23 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,8 +152,14 @@ WLEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + new_value = readFromFloat(float_update + float_value); + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ - panic("Workload not implemented\n"); + panic("Workload not implemented."); } return new_value; } From e404933980e1dc06ac9646233ddd53d4b79d569e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 21:23:27 -0700 Subject: [PATCH 192/279] making workload appropriate inits --- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 17 +++++--- src/accl/graph/sega/centeral_controller.hh | 1 + src/accl/graph/sega/coalesce_engine.cc | 51 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 2 +- 5 files changed, 50 insertions(+), 22 deletions(-) diff --git a/src/accl/graph/sega/CenteralController.py 
b/src/accl/graph/sega/CenteralController.py index 2ba53c231f..ebc8281641 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,5 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 82e63d512e..9231f96379 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -71,11 +71,8 @@ CenteralController::initState() [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(pkt->getAddr())) { - mpu->recvFunctional(pkt); - break; - } + if (contains(range_list, pkt->getAddr())) { + mpu->recvFunctional(pkt); } } }, system->cacheLineSize()); @@ -139,6 +136,16 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createInitialPRUpdate() +{ + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } +} + void CenteralController::recvDoneSignal() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index d006851e3b..5b0f5d6816 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -64,6 +64,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createInitialPRUpdate(); void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 91072a1da8..92ad346b30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ 
b/src/accl/graph/sega/coalesce_engine.cc @@ -75,6 +75,40 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::algoInit(PacketPtr pkt) +{ + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { + //TODO: Add Alpha + int bit_index_base = getBitIndexBase(pkt->getAddr()); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(1 - 0.2); + items[i].prop = readFromFloat(1 - 0.2); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); +} + +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) +{ + if(workload == "BFS"){ + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); + } else{ + panic("The workload is not recognize"); + } +} + void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -100,6 +134,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { + algoInit(pkt); memPort.sendFunctional(pkt); } } @@ -111,22 +146,6 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS"){ - return update != value; - } else if (workload == "SSSP"){ - return update < value; - } else if (workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ - panic("The workload is not recognize"); - } -} - // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index a087f37b4d..49ee441ed3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -130,7 +130,7 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); + void algoInit(PacketPtr pkt); bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; From 56007c93d50a34fb094325e48e48ab72e8312f3f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Oct 2022 14:41:19 -0700 Subject: [PATCH 193/279] wip for implementing prewB and prePush apply functions. --- src/accl/graph/sega/CoalesceEngine.py | 7 ++- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 61 +++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 ++ src/accl/graph/sega/mpu.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 6 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index eeba279b7a..a50a814e89 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,6 +43,11 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") + post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + workload = Param.String("BFS", "Name of the workload") - thereshold = Param.Float('0.0001', "Score threshold for Pagerank") + threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 91325ab53f..7fe392cc9e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -45,4 +45,4 @@ class WLEngine(BaseReduceEngine): "many updates as this queueu has " "entries at the same time.") - workload = Param.String('BFS',"Name of the workload") \ No newline at end of file + workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92ad346b30..4e1fe79899 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), + postApplyWBQueueSize(params.post_apply_wb_queue_size), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -67,6 +68,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); + + // TODO: Get rid of these booleans. 
+ // applyBeforeWB = true; + // if (workload == "PR") { + // applyBeforeWB = false; + // } + // applyBeforePush = false; + // if (workload == "PR") { + // applyBeforePush = true; + // } } void @@ -84,7 +95,7 @@ CoalesceEngine::algoInit(PacketPtr pkt) //TODO: Add Alpha int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(1 - 0.2); + items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - 0.2); needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); @@ -96,15 +107,15 @@ CoalesceEngine::algoInit(PacketPtr pkt) bool CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - if(workload == "BFS"){ + if(workload == "BFS") { return update != value; - } else if (workload == "SSSP"){ + } else if (workload == "SSSP") { return update < value; - } else if (workload == "PR"){ + } else if (workload == "PR") { float float_value = writeToFloat(value); float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ + return params().threshold <= abs(float_update - float_value); + } else { panic("The workload is not recognize"); } } @@ -663,14 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (applyCondition( - wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { - cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].needsApply |= true; - // NOTE: We don't set needsWB and rely on processNextApplyEvent to - // set that bit. 
+ if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } + if (applyCondition(wl.tempProp, + cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].needsApply |= true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -773,10 +785,13 @@ CoalesceEngine::processNextApplyEvent() for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (new_prop != current_prop) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - + if (applyCondition(new_prop, current_prop)) { + if (applyBeforeWB) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + } + // TODO: Implement this function + // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); @@ -1046,6 +1061,18 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; + + // TODO: Implement a function like this. + // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); + // TODO: After implementing the above function get rid of this bool + // if (applyBeforePush) { + // cacheBlocks[block_index].items[wl_offset].prop = + // cacheBlocks[block_index].items[wl_offset].tempProp; + // } + // TODO: Implement recvVertexPush2 in PushEngine. 
+ // owner->recvVertexPush2(vertex_addr, delta, + // cacheBlocks[block_index].items[wl_offset].edgeIndex, + // cacheBlocks[block_index].items[wl_offset].degree); owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); stats.verticesPushed++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 49ee441ed3..c9564ac187 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,11 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + bool applyBeforeWB; + bool applyBeforePush; int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; + int postApplyWBQueueSize; + std::deque postApplyWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 229bd28950..9dcb9de5d7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,6 +75,8 @@ class MPU : public SimObject bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c03e78851c..ec0dd09e43 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 2dd3d4d3bc7fc11ccfbdff22a2f46d49c5cc00e4 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 14 Oct 2022 16:24:09 -0700 Subject: [PATCH 194/279] 
Adding GraphWorkload class. --- configs/accl/sega-hbm.py | 7 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/base/graph_workload.cc | 66 ++++++++++++ src/accl/graph/base/graph_workload.hh | 74 +++++++++++++ src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 ++ src/accl/graph/sega/centeral_controller.hh | 4 + src/accl/graph/sega/coalesce_engine.cc | 76 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 8 ++ src/accl/graph/sega/mpu.hh | 1 + src/accl/graph/sega/push_engine.cc | 115 +++++++++++---------- src/accl/graph/sega/push_engine.hh | 5 +- src/accl/graph/sega/wl_engine.cc | 39 ++++--- src/accl/graph/sega/wl_engine.hh | 5 +- 16 files changed, 302 insertions(+), 122 deletions(-) create mode 100644 src/accl/graph/base/graph_workload.cc create mode 100644 src/accl/graph/base/graph_workload.hh diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index cdc752f2bd..50fd5f3069 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -56,7 +56,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): cache_size=cache_size, num_mshr_entry=64, num_tgts_per_mshr=64, - max_resp_per_cycle=8 + max_resp_per_cycle=8, + post_apply_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -135,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -166,6 +170,7 @@ def get_inputs(): m5.instantiate() system.create_initial_bfs_update(init_addr, init_value) + system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because 
{exit_event.getCause()}") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8b741abfc8..35111c34d2 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -30,3 +30,4 @@ Import("*") SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 3753e10d62..2d81375b63 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,10 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include #include +#include namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..3d0d45b1de --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/graph_workload.hh" + +namespace gem5 +{ + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp < wl.prop; +} + +bool +BFSWorkload::preWBApply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.degree > 0; +} + +std::tuple +BFSWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t value = wl.prop; + return std::make_tuple(value, false); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..304b434a3d --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + ~GraphWorkload() {} + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual bool applyCondition(WorkListItem wl) = 0; + virtual bool preWBApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + GraphWorkload(), + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index ebc8281641..17badf9ec4 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,7 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createBFSWorkload"), PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 9231f96379..2074f69f08 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -83,6 +83,10 @@ CenteralController::initState() void CenteralController::startup() { + for (auto mpu: mpuVector) { + mpu->recvWorkload(workload); + } + 
while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { @@ -136,6 +140,12 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + void CenteralController::createInitialPRUpdate() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 5b0f5d6816..1f1df00b4b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -32,6 +32,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "debug/FinalAnswer.hh" @@ -47,6 +48,8 @@ class CenteralController : public ClockedObject private: System* system; + GraphWorkload* workload; + Addr maxVertexAddr; std::deque initialUpdates; @@ -64,6 +67,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createInitialPRUpdate(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4e1fe79899..20bfaf8481 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -68,16 +68,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); - - // TODO: Get rid of these booleans. 
- // applyBeforeWB = true; - // if (workload == "PR") { - // applyBeforeWB = false; - // } - // applyBeforePush = false; - // if (workload == "PR") { - // applyBeforePush = true; - // } } void @@ -90,9 +80,10 @@ void CoalesceEngine::algoInit(PacketPtr pkt) { WorkListItem items[numElementsPerLine]; - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { //TODO: Add Alpha + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); @@ -100,25 +91,39 @@ CoalesceEngine::algoInit(PacketPtr pkt) needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS") { - return update != value; - } else if (workload == "SSSP") { - return update < value; - } else if (workload == "PR") { - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().threshold <= abs(float_update - float_value); - } else { - panic("The workload is not recognize"); - } -} +// bool +// CoalesceEngine::applyCondition(WorkListItem wl) +// { +// if (workload == "BFS") { +// return wl.tempProp != wl.prop; +// } else if (workload == "SSSP") { +// return wl.tempProp < wl.prop; +// } else if (workload == "PR") { +// float float_temp = writeToFloat(wl.tempProp); +// float float_prop = writeToFloat(wl.prop); +// return params().threshold <= abs(float_prop - float_temp); +// } else { +// panic("The workload is not recognized."); +// } +// } + +// bool +// CoalesceEngine::preWBApply(WorkListItem& wl) +// { +// if (workload == "BFS") { +// uint32_t new_prop = std::min(wl.tempProp, wl.prop); +// wl.tempProp = new_prop; +// wl.prop = new_prop; +// return 
wl.degree > 0; +// } else { +// panic("The workload is not recognized."); +// } +// } void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -678,11 +683,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } - if (applyCondition(wl.tempProp, - cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -783,19 +787,13 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (applyCondition(new_prop, current_prop)) { - if (applyBeforeWB) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - } + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { // TODO: Implement this function - // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (cacheBlocks[block_index].items[index].degree > 0) { + if (do_push) { if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9564ac187..3492cab9dc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,8 +31,9 
@@ #include -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -134,8 +135,11 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; + GraphWorkload* graphWorkload; + void algoInit(PacketPtr pkt); - bool applyCondition(uint32_t update, uint32_t value); + bool applyCondition(WorkListItem wl); + bool preWBApply(WorkListItem& wl); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -203,6 +207,7 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); bool recvWLRead(Addr addr); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 44054d1efb..70f1e05f32 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -71,6 +71,14 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 9dcb9de5d7..8f6101c325 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -69,6 +69,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return 
coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9efa03f08..a661a755b7 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,59 +151,59 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -uint32_t -PushEngine::reduce(uint32_t update, uint32_t value) -{ - std::string workload = params().workload; - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - new_value = update + value; - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented\n"); - } - return new_value; -} - -uint32_t -PushEngine::propagate(uint32_t delta, uint32_t weight) -{ - std::string workload = params().workload; - uint32_t update; - if (workload == "BFS") { - update = delta + 1; - } else if (workload == "SSSP") { - update = delta + weight; - } else if (workload == "PR") { - float float_form = writeToFloat(delta); - float float_update = float_form * weight * params().alpha; - update = readFromFloat(float_update); - } else{ - panic("The workload %s is not supported", workload); - } - return update; -} - -uint32_t -PushEngine::calculateValue(WorkListItem wl) -{ - std::string workload = params().workload; - uint32_t delta; - if (workload == "PR") { - float property = writeToFloat(wl.prop) / wl.degree; - delta = readFromFloat(property); - } else if (workload == "BFS") { - delta = wl.prop; - } else if (workload == "SSSP") { - delta = wl.prop; - } else { - panic("Workload not supported."); - } - return delta; -} +// uint32_t +// PushEngine::reduce(uint32_t update, uint32_t value) +// { +// std::string workload = params().workload; +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// new_value = update + value; +// } else if(workload == "SSSP"){ +// new_value = 
std::min(update, value); +// } else{ +// panic("Workload not implemented\n"); +// } +// return new_value; +// } + +// uint32_t +// PushEngine::propagate(uint32_t delta, uint32_t weight) +// { +// std::string workload = params().workload; +// uint32_t update; +// if (workload == "BFS") { +// update = delta + 1; +// } else if (workload == "SSSP") { +// update = delta + weight; +// } else if (workload == "PR") { +// float float_form = writeToFloat(delta); +// float float_update = float_form * weight * params().alpha; +// update = readFromFloat(float_update); +// } else{ +// panic("The workload %s is not supported", workload); +// } +// return update; +// } + +// uint32_t +// PushEngine::calculateValue(WorkListItem wl) +// { +// std::string workload = params().workload; +// uint32_t delta; +// if (workload == "PR") { +// float property = writeToFloat(wl.prop) / wl.degree; +// delta = readFromFloat(property); +// } else if (workload == "BFS") { +// delta = wl.prop; +// } else if (workload == "SSSP") { +// delta = wl.prop; +// } else { +// panic("Workload not supported."); +// } +// return delta; +// } void PushEngine::start() @@ -247,9 +247,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = calculateValue(wl); + // uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, value); + peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -364,7 +364,8 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); 
metaEdgeQueue.pop_front(); @@ -419,7 +420,7 @@ PushEngine::enqueueUpdate(Update update) Update& curr_update = std::get<0>(entry); if (curr_update.dst == update.dst) { uint32_t old_value = curr_update.value; - curr_update.value = reduce(old_value, update.value); + curr_update.value = graphWorkload->reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " "for destination %d with new value: %d by " "coalescing %d and %d. \n", __func__, update.dst, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ec0dd09e43..47db96d818 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,8 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -115,6 +116,7 @@ class PushEngine : public BaseMemoryEngine int numElements; }; MPU* owner; + GraphWorkload* graphWorkload; bool _running; Tick lastIdleEntranceTick; @@ -194,6 +196,7 @@ class PushEngine : public BaseMemoryEngine virtual void init() override; void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void start(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f684650f23..86acd40b69 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,7 +41,6 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), - workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ 
processNextReduceEvent(); }, name()), stats(*this) @@ -146,23 +145,23 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -uint32_t -WLEngine::reduce(uint32_t update, uint32_t value) -{ - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - new_value = readFromFloat(float_update + float_value); - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented."); - } - return new_value; -} +// uint32_t +// WLEngine::reduce(uint32_t update, uint32_t value) +// { +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// float float_value = writeToFloat(value); +// float float_update = writeToFloat(update); +// new_value = readFromFloat(float_update + float_value); +// } else if(workload == "SSSP"){ +// new_value = std::min(update, value); +// } else{ +// panic("Workload not implemented."); +// } +// return new_value; +// } bool WLEngine::handleIncomingUpdate(PacketPtr pkt) @@ -251,7 +250,7 @@ WLEngine::processNextReadEvent() "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - reduce(update_value, registerFile[update_addr]); + graphWorkload->reduce(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -310,7 +309,7 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - reduce(update_value, workListFile[addr].tempProp); + graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 7578044cbf..0d0e532269 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -70,7 +71,8 @@ class WLEngine : public BaseReduceEngine }; MPU* owner; - + GraphWorkload* graphWorkload; + std::vector inPorts; int updateQueueSize; @@ -118,6 +120,7 @@ class WLEngine : public BaseReduceEngine void registerMPU(MPU* mpu); AddrRangeList getAddrRanges(); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } void recvFunctional(PacketPtr pkt); bool handleIncomingUpdate(PacketPtr pkt); From 718a8375db6febcf21cd0195b845c4af24d1d61d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 15 Oct 2022 16:59:05 -0700 Subject: [PATCH 195/279] Cleaning up. 
--- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/PushEngine.py | 7 +-- src/accl/graph/sega/WLEngine.py | 2 - src/accl/graph/sega/centeral_controller.cc | 5 +- src/accl/graph/sega/coalesce_engine.cc | 64 ++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- src/accl/graph/sega/push_engine.cc | 58 +------------------- src/accl/graph/sega/push_engine.hh | 4 -- src/accl/graph/sega/wl_engine.cc | 18 ------ src/accl/graph/sega/wl_engine.hh | 6 +- 10 files changed, 23 insertions(+), 152 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a50a814e89..d462d618e6 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -48,6 +48,3 @@ class CoalesceEngine(BaseMemoryEngine): "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - workload = Param.String("BFS", "Name of the workload") - - threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 52dc0e2506..20c5452d43 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - workload = Param.String("BFS", "Name of the workload.") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -43,7 +41,7 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory.") - + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") @@ -51,6 +49,3 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") - - alpha = Param.Float(0.8, "This parameter is 
specific to pagerank") - diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 7fe392cc9e..5a8ed9c9fd 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -44,5 +44,3 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") - - workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 2074f69f08..fd282834e9 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -55,6 +55,7 @@ CenteralController::initState() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->recvWorkload(workload); } const auto& file = params().image_file; if (file == "") @@ -83,10 +84,6 @@ CenteralController::initState() void CenteralController::startup() { - for (auto mpu: mpuVector) { - mpu->recvWorkload(workload); - } - while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 20bfaf8481..fa5099353e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postApplyWBQueueSize(params.post_apply_wb_queue_size), - workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -76,52 +75,22 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -void -CoalesceEngine::algoInit(PacketPtr pkt) -{ - WorkListItem items[numElementsPerLine]; - - if(workload == "PR") { - //TODO: Add Alpha - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - int bit_index_base = getBitIndexBase(pkt->getAddr()); 
- for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - 0.2); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - } - -} - -// bool -// CoalesceEngine::applyCondition(WorkListItem wl) -// { -// if (workload == "BFS") { -// return wl.tempProp != wl.prop; -// } else if (workload == "SSSP") { -// return wl.tempProp < wl.prop; -// } else if (workload == "PR") { -// float float_temp = writeToFloat(wl.tempProp); -// float float_prop = writeToFloat(wl.prop); -// return params().threshold <= abs(float_prop - float_temp); -// } else { -// panic("The workload is not recognized."); -// } -// } - -// bool -// CoalesceEngine::preWBApply(WorkListItem& wl) +// void +// CoalesceEngine::algoInit(PacketPtr pkt) // { -// if (workload == "BFS") { -// uint32_t new_prop = std::min(wl.tempProp, wl.prop); -// wl.tempProp = new_prop; -// wl.prop = new_prop; -// return wl.degree > 0; -// } else { -// panic("The workload is not recognized."); +// WorkListItem items[numElementsPerLine]; + +// if(workload == "PR") { +// //TODO: Add Alpha +// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); +// int bit_index_base = getBitIndexBase(pkt->getAddr()); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - 0.2); +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// } +// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); // } // } @@ -150,7 +119,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - algoInit(pkt); + // TODO: Add and implement init function for GraphWorkload. 
+ // graphWorkload->init(pkt); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3492cab9dc..0a2c0ca5ff 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -103,6 +103,7 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; MPU* owner; + GraphWorkload* graphWorkload; int numLines; int numElementsPerLine; @@ -134,13 +135,6 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; - std::string workload; - GraphWorkload* graphWorkload; - - void algoInit(PacketPtr pkt); - bool applyCondition(WorkListItem wl); - bool preWBApply(WorkListItem& wl); - MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a661a755b7..c54f19307f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,60 +151,6 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -// uint32_t -// PushEngine::reduce(uint32_t update, uint32_t value) -// { -// std::string workload = params().workload; -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// new_value = update + value; -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented\n"); -// } -// return new_value; -// } - -// uint32_t -// PushEngine::propagate(uint32_t delta, uint32_t weight) -// { -// std::string workload = params().workload; -// uint32_t update; -// if (workload == "BFS") { -// update = delta + 1; -// } else if (workload == "SSSP") { -// update = delta + weight; -// } else if (workload == "PR") { -// float 
float_form = writeToFloat(delta); -// float float_update = float_form * weight * params().alpha; -// update = readFromFloat(float_update); -// } else{ -// panic("The workload %s is not supported", workload); -// } -// return update; -// } - -// uint32_t -// PushEngine::calculateValue(WorkListItem wl) -// { -// std::string workload = params().workload; -// uint32_t delta; -// if (workload == "PR") { -// float property = writeToFloat(wl.prop) / wl.degree; -// delta = readFromFloat(property); -// } else if (workload == "BFS") { -// delta = wl.prop; -// } else if (workload == "SSSP") { -// delta = wl.prop; -// } else { -// panic("Workload not supported."); -// } -// return delta; -// } - void PushEngine::start() { @@ -251,7 +197,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); - + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -364,7 +310,7 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = + uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); metaEdgeQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 47db96d818..1112176897 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,10 +133,6 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - uint32_t reduce(uint32_t update, uint32_t value); - uint32_t propagate(uint32_t value, uint32_t weight); - uint32_t calculateValue(WorkListItem wl); - int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); bool 
enqueueUpdate(Update update); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 86acd40b69..85fe9be2ca 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -145,24 +145,6 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -// uint32_t -// WLEngine::reduce(uint32_t update, uint32_t value) -// { -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// float float_value = writeToFloat(value); -// float float_update = writeToFloat(update); -// new_value = readFromFloat(float_update + float_value); -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented."); -// } -// return new_value; -// } - bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0d0e532269..f442d6060e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -72,7 +72,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; GraphWorkload* graphWorkload; - + std::vector inPorts; int updateQueueSize; @@ -81,12 +81,8 @@ class WLEngine : public BaseReduceEngine int registerFileSize; std::unordered_map registerFile; std::unordered_map vertexReadTime; - std::unordered_map workListFile; - std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From dd9cebb73b1f4aa234ffa9258c62a13456a0f552 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 16 Oct 2022 17:05:07 -0700 Subject: [PATCH 196/279] Implementing post push wb buffer. 
--- src/accl/graph/base/graph_workload.cc | 19 +- src/accl/graph/base/graph_workload.hh | 6 +- src/accl/graph/sega/CoalesceEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 239 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 10 +- src/accl/graph/sega/mpu.cc | 12 +- src/accl/graph/sega/mpu.hh | 4 +- src/accl/graph/sega/push_engine.cc | 17 +- src/accl/graph/sega/push_engine.hh | 23 ++- 9 files changed, 223 insertions(+), 109 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3d0d45b1de..6a8e000515 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,10 +28,10 @@ #include "accl/graph/base/graph_workload.hh" -namespace gem5 +namespace gem5 { -uint32_t +uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { return std::min(update, value); @@ -43,7 +43,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) return value + 1; } -bool +bool BFSWorkload::applyCondition(WorkListItem wl) { return wl.tempProp < wl.prop; @@ -52,15 +52,20 @@ BFSWorkload::applyCondition(WorkListItem wl) bool BFSWorkload::preWBApply(WorkListItem& wl) { - wl.prop = wl.tempProp; - return wl.degree > 0; + if (applyCondition(wl)) { + wl.prop = wl.tempProp; + if (wl.degree > 0) { + return true; + } + } + return false; } -std::tuple +std::tuple BFSWorkload::prePushApply(WorkListItem& wl) { uint32_t value = wl.prop; - return std::make_tuple(value, false); + return std::make_tuple(value, true, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 304b434a3d..c4db5c9e2f 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -46,7 +46,7 @@ class GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple 
prePushApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -56,7 +56,7 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; public: BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), + GraphWorkload(), initAddr(init_addr), initValue(init_value) {} @@ -66,7 +66,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); }; } diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index d462d618e6..1fd3b968c5 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,7 +43,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") - post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fa5099353e..0c223a8a5b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,16 +49,17 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), - postApplyWBQueueSize(params.post_apply_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextApplyEvent([this] { - processNextApplyEvent(); - }, name() + ".nextApplyEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -469,7 +470,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + bool do_wb = false; if (pkt->findNextSenderState()) { assert(!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid))); @@ -480,7 +483,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); uint64_t send_mask = pendingVertexPullReads[addr]; - WorkListItem* items = pkt->getPtr(); // No applying of the line needed. 
for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); @@ -489,19 +491,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(needsPush[it + i] == 1); needsPush[it + i] = 0; _workCount--; - owner->recvVertexPush(vertex_addr, items[i]); + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pendingVertexPullReads.erase(addr); - delete pkt; - return true; + maxPotentialPostPushWB--; } if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); + "fill cacheBlocks[%d].\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); @@ -512,19 +525,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, - peerMemoryAtomSize); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; 
cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - delete pkt; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { @@ -570,6 +594,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + delete pkt; return true; } @@ -675,8 +700,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } } else { assert(MSHR.size() <= numMSHREntries); @@ -742,7 +767,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } void -CoalesceEngine::processNextApplyEvent() +CoalesceEngine::processNextPreWBApplyEvent() { int block_index = applyQueue.front(); DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" @@ -757,27 +782,22 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { - // TODO: Implement this function - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - int bit_index_base = - getBitIndexBase(cacheBlocks[block_index].addr); - - if (do_push) { - if (needsPush[bit_index_base + index] == 0) { - _workCount++; - needsPush[bit_index_base + index] = 1; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + _workCount++; + needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); } } } } stats.bitvectorLength.sample(needsPush.count()); - cacheBlocks[block_index].needsWB = true; + assert(cacheBlocks[block_index].needsWB); cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); @@ -810,8 +830,8 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } if (done()) { @@ -870,16 +890,78 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } } @@ -948,6 +1030,18 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -1017,6 +1111,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) assert(vertex_send_mask == 0); send_mask |= (1 << index_offset); pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; } if (bit_status == BitStatus::IN_CACHE) { // renaming the outputs to their local names. 
@@ -1030,35 +1125,39 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; - // TODO: Implement a function like this. - // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); - // TODO: After implementing the above function get rid of this bool - // if (applyBeforePush) { - // cacheBlocks[block_index].items[wl_offset].prop = - // cacheBlocks[block_index].items[wl_offset].tempProp; - // } - // TODO: Implement recvVertexPush2 in PushEngine. - // owner->recvVertexPush2(vertex_addr, delta, - // cacheBlocks[block_index].items[wl_offset].edgeIndex, - // cacheBlocks[block_index].items[wl_offset].degree); - owner->recvVertexPush( - vertex_addr, cacheBlocks[block_index].items[wl_offset]); + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; } if (bit_status == BitStatus::IN_MEMORY) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - pendingVertexPullReads[addr] = send_mask; + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << 
index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } } - numPullsReceived--; } stats.bitvectorSearchStatus[bit_status]++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0a2c0ca5ff..c0091a494d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -123,14 +123,15 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; - int postApplyWBQueueSize; - std::deque postApplyWBQueue; + int postPushWBQueueSize; + std::deque> postPushWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalPullAddr(); + int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. 
std::unordered_map pendingVertexPullReads; @@ -140,14 +141,15 @@ class CoalesceEngine : public BaseMemoryEngine void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); struct CoalesceStats : public statistics::Group { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 70f1e05f32..b91aa21a53 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -80,9 +81,16 @@ MPU::recvWorkload(GraphWorkload* workload) } void -MPU::recvVertexPush(Addr addr, WorkListItem wl) +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - pushEngine->recvVertexPush(addr, wl); + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvPrevPullCorrection() +{ + DPRINTF(MPU, "%s: Fuck!\n", __func__); } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f6101c325..8f3b29f603 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,9 +75,9 @@ class MPU : public SimObject void recvVertexPull() { return coalesceEngine->recvVertexPull(); } bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t 
degree); + void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c54f19307f..c76567696e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -184,18 +184,18 @@ PushEngine::processNextVertexPullEvent() } void -PushEngine::recvVertexPush(Addr addr, WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - assert(wl.degree > 0); + assert(degree > 0); assert((edgePointerQueueSize == 0) || ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); - // uint32_t value = calculateValue(wl); - EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -207,6 +207,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } + } void @@ -229,7 +230,7 @@ PushEngine::processNextMemoryReadEvent() "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1112176897..848c93e313 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh 
@@ -66,21 +66,24 @@ class PushEngine : public BaseMemoryEngine class EdgeReadInfoGen { private: + Addr _src; + uint32_t _delta; + Addr _start; Addr _end; size_t _step; size_t _atom; - Addr _src; - uint32_t _value; - public: - EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + EdgeReadInfoGen(Addr src, uint32_t delta, Addr start, + Addr end, size_t step, size_t atom): + _src(src), _delta(delta), _start(start), + _end(end), _step(step), _atom(atom) {} + Addr src() { return _src; } + uint32_t delta() { return _delta; } + std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); @@ -105,9 +108,6 @@ class PushEngine : public BaseMemoryEngine } bool done() { return (_start >= _end); } - - Addr src() { return _src; } - uint32_t value() { return _value; } }; struct PushInfo { Addr src; @@ -197,8 +197,7 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 31458d9802768f15ed4ea65e2ab2abfaf4f4b21d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Oct 2022 08:40:47 -0700 Subject: [PATCH 197/279] Implementing correction function for PushEngine. 
--- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- src/accl/graph/sega/mpu.cc | 2 +- src/accl/graph/sega/push_engine.cc | 9 +++++++++ src/accl/graph/sega/push_engine.hh | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0c223a8a5b..441457f2e8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -501,9 +501,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) owner->recvVertexPush(vertex_addr, delta, items[i].edgeIndex, items[i].degree); } else { + // TODO: Add a stat to count this. owner->recvPrevPullCorrection(); } - stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -548,7 +548,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextMemoryEvent, nextCycle()); } } else { - DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); + // TODO: Add a stat to count this. + DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b91aa21a53..b30060238d 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -90,7 +90,7 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, void MPU::recvPrevPullCorrection() { - DPRINTF(MPU, "%s: Fuck!\n", __func__); + pushEngine->recvPrevPullCorrection(); } void diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c76567696e..07f37a28dc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -207,7 +207,16 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } +} +void +PushEngine::recvPrevPullCorrection() +{ + assert(numPendingPulls > 0); + numPendingPulls--; + if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 848c93e313..2e1de25390 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,7 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvPrevPullCorrection(); void recvReqRetry(); From a0064f14673301202f2ca928146f138c3e87082e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 19 Oct 2022 08:03:16 -0700 Subject: [PATCH 198/279] Adding initialization to graphWorkloads --- configs/accl/sega-hbm.py | 4 +- src/accl/graph/base/data_structs.hh | 2 + src/accl/graph/base/graph_workload.cc | 72 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 44 +++++++++++-- src/accl/graph/sega/centeral_controller.cc | 22 ++----- src/accl/graph/sega/centeral_controller.hh | 7 ++- src/accl/graph/sega/coalesce_engine.cc | 3 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 8 files changed, 128 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 50fd5f3069..9078c185f3 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -57,7 +57,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): num_mshr_entry=64, num_tgts_per_mshr=64, max_resp_per_cycle=8, - post_apply_wb_queue_size=64 + post_push_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -136,7 +136,7 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) - + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
2d81375b63..70babf5960 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -37,6 +37,8 @@ #include #include +#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6a8e000515..542f2e0221 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -31,6 +31,37 @@ namespace gem5 { +BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): + GraphWorkload(), initValue(init_value), atomSize(atom_size) +{ + initAddrBase = roundDown(init_addr, atomSize); + initIndex = (init_addr - initAddrBase) / atomSize; + numElementsPerLine = atomSize / sizeof(WorkListItem); +} + + +void +BFSWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + if (pkt->getAddr() == initAddrBase) { + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + + items[initIndex].tempProp = initValue; + items[initIndex].prop = initValue; + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, atomSize); + } + +} + uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { @@ -68,4 +99,45 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + return update+value; +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + return (alpha*value*weight); +} + +bool +PRWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp != wl.prop; +} + +bool +PRWorkload::preWBApply(WorkListItem& wl) +{ + if (applyCondition(wl)) { + if (wl.degree > 0) { + return true; + } + } + return false; +} + +std::tuple +PRWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t delta = 
abs(wl.prop - wl.tempProp)/wl.degree; + if (delta > threshold) { + return std::make_tuple(delta, true, true); + } + uint32_t value = wl.tempProp; + return std::make_tuple(value, false, false); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c4db5c9e2f..cc0767305a 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -29,9 +29,13 @@ #ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ #define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#include +#include #include #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" +#include "mem/packet.hh" namespace gem5 @@ -42,6 +46,10 @@ class GraphWorkload public: GraphWorkload() {} ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -52,16 +60,42 @@ class GraphWorkload class BFSWorkload : public GraphWorkload { private: - uint64_t initAddr; + uint64_t initAddrBase; + int initIndex; uint32_t initValue; + int numElementsPerLine; + int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), - initAddr(init_addr), initValue(init_value) - {} + BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); ~BFSWorkload() {} + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + public: + 
PRWorkload(float alpha, float threshold): + GraphWorkload(), alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fd282834e9..dbd1705e8a 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -51,12 +51,13 @@ CenteralController::CenteralController(const Params& params): } void -CenteralController::initState() +CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->recvWorkload(workload); } + const auto& file = params().image_file; if (file == "") return; @@ -79,22 +80,11 @@ CenteralController::initState() }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); -} -void -CenteralController::startup() -{ - while(!initialUpdates.empty()) { - PacketPtr front = initialUpdates.front(); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(front->getAddr())) { - mpu->handleIncomingUpdate(front); - } - } + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount ()> 0)) { + mpu->start(); } - initialUpdates.pop_front(); } } @@ -140,7 +130,7 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value); + workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 
1f1df00b4b..4c5ff28ebe 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -48,8 +48,6 @@ class CenteralController : public ClockedObject private: System* system; - GraphWorkload* workload; - Addr maxVertexAddr; std::deque initialUpdates; @@ -60,10 +58,13 @@ class CenteralController : public ClockedObject template PacketPtr createUpdatePacket(Addr addr, T value); public: + + GraphWorkload* workload; + PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - virtual void initState() override; + // virtual void initState() override; virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 441457f2e8..b91b92c0fb 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -121,7 +121,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } } else { // TODO: Add and implement init function for GraphWorkload. - // graphWorkload->init(pkt); + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c0091a494d..926caf46db 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -38,7 +38,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { From 2ea00f9c5e724e191704a1666913c82236b8b7e0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 22 Oct 2022 12:36:32 -0700 Subject: [PATCH 199/279] Fixing algo start issue. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index dbd1705e8a..61ad7c10b4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,7 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount ()> 0)) { + if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b91b92c0fb..72ceba6f89 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1079,7 +1079,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if (cacheBlocks[block_index].addr != addr) { + } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 926caf46db..8c187f8fb8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -209,7 +209,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return _workCount; } + int workCount() { return needsPush.count(); } void recvVertexPull(); bool done(); From 0ef4d5eba30c2f65144171c25a67988894d69baf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 22 Oct 2022 13:49:41 -0700 Subject: [PATCH 200/279] Fixing block addr initialization. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 72ceba6f89..5b5374873c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -267,7 +267,7 @@ CoalesceEngine::recvWLRead(Addr addr) // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. - // assert(cacheBlocks[block_index].addr != aligned_addr); + assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8c187f8fb8..e710553be1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -73,7 +73,7 @@ class CoalesceEngine : public BaseMemoryEngine // Tick lastWLWriteTick; Block() {} Block(int num_elements): - addr(0), + addr(-1), busyMask(0), valid(false), needsApply(false), From aa6fb7d0ed793520325eb119dce28b81642bc290 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 21:43:33 -0700 Subject: [PATCH 201/279] Adding PR. 
--- src/accl/graph/base/graph_workload.cc | 48 ++++++++++++++++++---- src/accl/graph/base/graph_workload.hh | 15 ++++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 32 +-------------- src/accl/graph/sega/centeral_controller.hh | 8 +--- src/accl/graph/sega/coalesce_engine.cc | 27 ++---------- src/accl/graph/sega/coalesce_engine.hh | 3 -- 7 files changed, 57 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 542f2e0221..cbaef86a76 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -36,13 +36,13 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) { initAddrBase = roundDown(init_addr, atomSize); initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = atomSize / sizeof(WorkListItem); + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); } void BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) { if (pkt->getAddr() == initAddrBase) { @@ -99,23 +99,53 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): + GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +{ + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +} + +void +PRWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, 
atomSize); +} uint32_t PRWorkload::reduce(uint32_t update, uint32_t value) { - return update+value; + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); } uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { - return (alpha*value*weight); + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + return readFromFloat(alpha * value_float * weight_float); } bool PRWorkload::applyCondition(WorkListItem wl) { - return wl.tempProp != wl.prop; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return temp_float != prop_float; } bool @@ -132,12 +162,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) { - uint32_t delta = abs(wl.prop - wl.tempProp)/wl.degree; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = abs((temp_float - prop_float) / wl.degree); if (delta > threshold) { + wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } - uint32_t value = wl.tempProp; - return std::make_tuple(value, false, false); + return std::make_tuple(0, false, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index cc0767305a..831da97e71 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,7 @@ class GraphWorkload ~GraphWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; @@ -65,13 +65,14 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; int numElementsPerLine; int atomSize; + public: BFSWorkload(uint64_t init_addr, uint32_t init_value, int 
atom_size); ~BFSWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); @@ -86,15 +87,17 @@ class PRWorkload : public GraphWorkload private: float alpha; float threshold; + + int numElementsPerLine; + int atomSize; + public: - PRWorkload(float alpha, float threshold): - GraphWorkload(), alpha(alpha), threshold(threshold) - {} + PRWorkload(float alpha, float threshold, int atom_size); ~PRWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 17badf9ec4..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,8 +42,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ - PyBindMethod("createInitialBFSUpdate"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createInitialPRUpdate"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 61ad7c10b4..57198450d4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -103,30 +103,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -template PacketPtr -CenteralController::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - 
req->setPC(((Addr) value) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - - pkt->setLE(value); - - return pkt; -} - -void -CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) -{ - PacketPtr update = createUpdatePacket(init_addr, init_value); - initialUpdates.push_back(update); -} - void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { @@ -134,13 +110,9 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) } void -CenteralController::createInitialPRUpdate() +CenteralController::createPRWorkload(float alpha, float threshold) { - for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount() > 0)) { - mpu->start(); - } - } + workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4c5ff28ebe..9ddb1b35f0 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -49,13 +49,11 @@ class CenteralController : public ClockedObject System* system; Addr maxVertexAddr; - std::deque initialUpdates; std::vector mpuVector; std::unordered_map addrRangeListMap; PacketPtr createReadPacket(Addr addr, unsigned int size); - template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -63,13 +61,11 @@ class CenteralController : public ClockedObject PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - - // virtual void initState() override; virtual void startup() override; - void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createInitialPRUpdate(); + void createPRWorkload(float alpha, float threshold); + void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 
5b5374873c..e71cc1195f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,8 +48,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), - postPushWBQueueSize(params.post_push_wb_queue_size), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -76,25 +75,6 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -// void -// CoalesceEngine::algoInit(PacketPtr pkt) -// { -// WorkListItem items[numElementsPerLine]; - -// if(workload == "PR") { -// //TODO: Add Alpha -// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); -// int bit_index_base = getBitIndexBase(pkt->getAddr()); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - 0.2); -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// } -// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); -// } -// } - void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -491,7 +471,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -550,6 +529,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } else { // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. 
DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } @@ -788,7 +769,6 @@ CoalesceEngine::processNextPreWBApplyEvent() if (do_push) { int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { - _workCount++; needsPush[bit_index_base + index] = 1; activeBits.push_back(bit_index_base + index); if (!owner->running()) { @@ -1125,7 +1105,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e710553be1..c8fec38e5b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -116,9 +116,6 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; - bool applyBeforeWB; - bool applyBeforePush; - int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; From ac64518be820c2e1b6561cfa22d7ec725d6bcfb1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 22:14:05 -0700 Subject: [PATCH 202/279] Prepping for PR. 
--- configs/accl/sega-hbm.py | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 9078c185f3..1c9276f0a0 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -134,12 +134,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) + def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -169,8 +169,8 @@ def get_inputs(): m5.instantiate() - system.create_initial_bfs_update(init_addr, init_value) - system.create_bfs_workload(init_addr, init_value) + # system.create_bfs_workload(init_addr, init_value) + system.create_pr_workload(0.2, 0.0000001) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e71cc1195f..2d5445093a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -577,6 +577,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + + // TODO: Probably check for done here too. delete pkt; return true; } From a839eaade7b4a686e379d7d641146637a33beca6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 25 Oct 2022 13:52:56 -0700 Subject: [PATCH 203/279] Adding print function to GraphWorkload class. 
--- src/accl/graph/base/data_structs.hh | 21 ----------- src/accl/graph/base/graph_workload.cc | 44 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/centeral_controller.cc | 4 +- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 70babf5960..d9028e2f10 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,7 +34,6 @@ #include #include -#include #include #define MAX_BITVECTOR_SIZE (1 << 28) @@ -181,26 +180,6 @@ class UniqueFIFO } }; -template -float -writeToFloat(T value) -{ - assert(sizeof(T) == sizeof(float)); - float float_form; - std::memcpy(&float_form, &value, sizeof(float)); - return float_form; -} - -template -T -readFromFloat(float value) -{ - assert(sizeof(T) == sizeof(float)); - T float_bits; - std::memcpy(&float_bits, &value, sizeof(float)); - return float_bits; -} - } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index cbaef86a76..ead32c0eb8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,9 +28,34 @@ #include "accl/graph/base/graph_workload.hh" +#include + +#include "base/cprintf.hh" +#include "base/intmath.hh" + namespace gem5 { +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): GraphWorkload(), initValue(init_value), atomSize(atom_size) { @@ -99,6 +124,15 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +std::string 
+BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) { @@ -172,4 +206,14 @@ PRWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(0, false, false); } +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + return csprintf( + "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + temp_float, temp_float, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 831da97e71..c391a80c23 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,7 +34,6 @@ #include #include "accl/graph/base/data_structs.hh" -#include "base/intmath.hh" #include "mem/packet.hh" @@ -55,6 +54,7 @@ class GraphWorkload virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -79,6 +79,7 @@ class BFSWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -104,6 +105,7 @@ class PRWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; } diff --git a/src/accl/graph/sega/centeral_controller.cc 
b/src/accl/graph/sega/centeral_controller.cc index 57198450d4..fc2262e111 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -144,8 +144,8 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - std::string print = csprintf("WorkListItem[%lu][%d]: %s.", - addr, i, items[i].to_string()); + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, + workload->printWorkListItem(items[i])); std::cout << print << std::endl; } From 76cf9de16d6c0ab6e81a36c45e785236ec0d9b79 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 25 Oct 2022 16:48:11 -0700 Subject: [PATCH 204/279] Updating PR --- src/accl/graph/base/graph_workload.cc | 36 +++++++++-------- src/accl/graph/sega/coalesce_engine.cc | 53 ++++++++++++++++---------- src/accl/graph/sega/wl_engine.cc | 10 ++--- 3 files changed, 58 insertions(+), 41 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ead32c0eb8..9f7e5fc4c5 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -77,8 +77,10 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, items[initIndex].tempProp = initValue; items[initIndex].prop = initValue; - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); + if (items[initIndex].degree > 0) { + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + } pkt->deleteData(); pkt->allocate(); @@ -150,8 +152,10 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - alpha); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); + if (items[i].degree > 0) { + needsPush[bit_index_base + i] = 1; + 
activeBits.push_back(bit_index_base + i); + } } pkt->deleteData(); pkt->allocate(); @@ -170,7 +174,7 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); + float weight_float = writeToFloat(1); return readFromFloat(alpha * value_float * weight_float); } @@ -179,27 +183,27 @@ PRWorkload::applyCondition(WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - return temp_float != prop_float; + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; } bool PRWorkload::preWBApply(WorkListItem& wl) { - if (applyCondition(wl)) { - if (wl.degree > 0) { - return true; - } + if (applyCondition(wl) && (wl.degree > 0)) { + return true; } return false; } std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = abs((temp_float - prop_float) / wl.degree); - if (delta > threshold) { +{ + if (applyCondition(wl)) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + std::cout << "PRWorkload: delta: " << delta << std::endl; wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } @@ -211,7 +215,7 @@ PRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); return csprintf( - "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, temp_float, wl.degree, wl.edgeIndex ); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2d5445093a..0d1eecf43f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -157,7 +157,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%lu, and wl_offset: 
%d.\n", __func__, addr, block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { @@ -176,15 +176,17 @@ CoalesceEngine::recvWLRead(Addr addr) addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -476,6 +478,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -508,8 +511,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; @@ -550,12 +553,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -603,7 +608,9 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " @@ -640,12 +647,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " "wl: %s. This request maps to cacheBlocks[%d], " "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, wl.to_string(), + __func__, addr, graphWorkload->printWorkListItem(wl), block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, wl.to_string(), addr); + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts @@ -666,13 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string()); + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -899,12 +909,14 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -1061,7 +1073,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { + } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); @@ -1112,6 +1124,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 85fe9be2ca..a698f2cc0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -263,10 +263,10 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) workListFile[addr] = wl; DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); stats.vertexReadLatency.sample( ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); @@ -287,13 +287,13 @@ WLEngine::processNextReduceEvent() uint32_t update_value = registerFile[addr]; DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" ". 
registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - __func__, addr, registerFile[addr], - addr, workListFile[addr].to_string()); + __func__, addr, registerFile[addr], addr, + graphWorkload->printWorkListItem(workListFile[addr])); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", - __func__, addr, workListFile[addr].to_string()); + __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); stats.numReduce++; owner->recvWLWrite(addr, workListFile[addr]); From 89521874e813dc04fc39304083631ac7c6851999 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:11:05 -0700 Subject: [PATCH 205/279] Updating configs for pr and bfs. Fixing bugs for pr. --- configs/accl/bfs.py | 78 +++++++++++ configs/accl/pr.py | 78 +++++++++++ configs/accl/real-graph-gen.py | 41 ++++-- configs/accl/sega-hbm.py | 178 ------------------------- configs/accl/sega.py | 137 +++++++++---------- configs/accl/synth-graph-gen.py | 88 ++++++++---- src/accl/graph/base/graph_workload.cc | 10 +- src/accl/graph/sega/coalesce_engine.cc | 24 ++-- 8 files changed, 332 insertions(+), 302 deletions(-) create mode 100644 configs/accl/bfs.py create mode 100644 configs/accl/pr.py delete mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..d02faa96ca --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=float) + argparser.add_argument("init_value", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_bfs_workload(init_addr, init_value) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..59e8b924c6 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index db44c63a9a..b943a925c1 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -28,14 +28,20 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.path, args.num_gpts + if __name__ == "__main__": graph_path, num_gpts = get_inputs() @@ -59,16 +65,29 @@ def get_inputs(): print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if 
not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): os.remove(delete.path) print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}" + ) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py deleted file mode 100644 index 1c9276f0a0..0000000000 --- a/configs/accl/sega-hbm.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret, intlv_low_bit + intlv_bits - 1 - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8, - post_push_wb_queue_size=64 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=512, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), - dram_2=HBM_2000_4H_1x64()) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - 
range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.wl_engine.in_ports - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - def setReqPort(self, port): - self.push_engine.out_ports = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) - - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) - gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, 
threshold) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument("--verify", type=bool, help="Print final answer") - - args = argparser.parse_args() - - verify = False - if not args.verify is None: - verify = args.verify - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value, verify - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - # system.create_bfs_workload(init_addr, init_value) - system.create_pr_workload(0.2, 0.0000001) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") - if verify: - system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c50c525297..42c07e2e94 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -24,100 +24,111 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( start=plain_range.start, size=plain_range.size(), intlvHighBit=intlv_low_bit + intlv_bits - 1, xorHighBit=0, intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) + Xpush_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=512, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = 
self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.dram.range = vertex_range + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) - gpt.set_vertex_range(vertex_ranges[i]) + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_mpus]] + ) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus @@ -128,31 +139,11 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def 
create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py index 16985b3537..15e4a6eff2 100644 --- a/configs/accl/synth-graph-gen.py +++ b/configs/accl/synth-graph-gen.py @@ -28,15 +28,27 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") - argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "scale", type=int, help="The scale of the synth graph to generate." 
+ ) + argparser.add_argument( + "deg", + type=int, + help="The average degree of the synth graph to generate.", + ) + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.scale, args.deg, args.num_gpts + if __name__ == "__main__": scale, deg, num_gpts = get_inputs() @@ -62,18 +74,27 @@ def get_inputs(): for delete in os.scandir(graph_path): os.remove(delete.path) print(f"Deleted everything in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{scale} and deg {deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") + subprocess.run( + [ + f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt", + ] + ) + print(f"Generated a graph with scale " f"{scale} and deg {deg}") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt", + ] + ) + print( + f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt" + ) subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) print(f"Deleted {graph_path}/graph_unordered.txt") @@ -88,16 +109,31 @@ def get_inputs(): print(f"Created {graph_path}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in 
{graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}") + print( + f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" + ) + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}" + ) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9f7e5fc4c5..e362d605c0 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -174,7 +174,9 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(1); + float weight_float = 1.0; + float delta = alpha * value_float * weight_float; + return readFromFloat(alpha * value_float * weight_float); } @@ -198,14 +200,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ +{ if (applyCondition(wl)) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float delta = (temp_float - prop_float) / wl.degree; - std::cout << "PRWorkload: delta: " << delta << std::endl; + uint32_t delta_uint = readFromFloat(delta); wl.prop = wl.tempProp; - return std::make_tuple(delta, true, true); + return std::make_tuple(delta_uint, true, true); } return std::make_tuple(0, false, false); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0d1eecf43f..2f6555602c 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -196,7 +196,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; // HACK: If a read happens on the same cycle as another operation such - // apply setLastChangedTick to half a cycle later so that operations + // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" cacheBlocks[block_index].lastChangedTick = @@ -478,7 +478,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -517,7 +516,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -564,7 +566,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); + // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); @@ -608,8 +610,8 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), + __func__, + graphWorkload->printWorkListItem(worklist_response), addr_response); responseQueue.pop_front(); @@ -652,7 +654,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, + "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. assert(cacheBlocks[block_index].addr == aligned_addr); @@ -874,8 +876,11 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. 
- assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - // + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); @@ -1124,7 +1129,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, From c12acb898baaf3fdf67d0c5c28686281dcec4d6e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:46:18 -0700 Subject: [PATCH 206/279] Fixing typos. --- configs/accl/bfs.py | 8 ++++---- configs/accl/sega.py | 2 +- src/accl/graph/base/graph_workload.cc | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index d02faa96ca..fc32b96642 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -37,8 +37,8 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=float) - argparser.add_argument("init_value", type=float) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--verify", dest="verify", @@ -54,8 +54,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, - args.alpha, - args.threshold, + args.init_addr, + args.init_value, args.verify, ) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 42c07e2e94..0f4b133791 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -59,7 +59,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): 
post_push_wb_queue_size=64, ) self.push_engine = PushEngine( - Xpush_req_queue_size=32, + push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=512, update_queue_size=32, diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e362d605c0..44136cb4c1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -175,7 +175,6 @@ PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); float weight_float = 1.0; - float delta = alpha * value_float * weight_float; return readFromFloat(alpha * value_float * weight_float); } From ae8729154ab4c500360c19d7c3cb3b5d895984f3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 27 Oct 2022 14:24:18 -0700 Subject: [PATCH 207/279] Adding sample script. --- configs/accl/pr-sample.py | 109 +++++++++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 2 +- 2 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py new file mode 100644 index 0000000000..ac3616dc84 --- /dev/null +++ b/configs/accl/pr-sample.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 10us", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + args.sample, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + verify, + sample, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2f6555602c..1dbe2a0d56 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -519,7 +519,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // HACK: In case processNextRead is called on the same tick as curTick // and is scheduled to read 
to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); + curTick() - (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); From f0dc01eb09cb81a4903971126b47faacfedd0681 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 28 Oct 2022 11:02:32 -0700 Subject: [PATCH 208/279] Fixing sim performance issue. --- src/accl/graph/base/graph_workload.cc | 8 ++++++-- src/accl/graph/base/graph_workload.hh | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 7 +++++-- src/accl/graph/sega/coalesce_engine.hh | 18 ++++++++++++++++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 44136cb4c1..07accff44f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -68,7 +68,8 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) void BFSWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { if (pkt->getAddr() == initAddrBase) { WorkListItem items[numElementsPerLine]; @@ -80,6 +81,7 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, if (items[initIndex].degree > 0) { needsPush[bit_index_base + initIndex] = 1; activeBits.push_back(bit_index_base + initIndex); + _workCount++; } pkt->deleteData(); @@ -144,7 +146,8 @@ PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): void PRWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { WorkListItem items[numElementsPerLine]; @@ -155,6 +158,7 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, if (items[i].degree > 0) { needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); + _workCount++; } } 
pkt->deleteData(); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c391a80c23..6bbc4935c2 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,8 @@ class GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) = 0; + std::deque& activeBits, + int& _workCount) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -73,7 +74,8 @@ class BFSWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); @@ -99,7 +101,8 @@ class PRWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1dbe2a0d56..38f05f937a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,7 +47,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), 
maxPotentialPostPushWB(0), nextMemoryEvent([this] { @@ -102,7 +102,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } else { // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } } @@ -473,6 +473,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -784,6 +785,7 @@ CoalesceEngine::processNextPreWBApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { needsPush[bit_index_base + index] = 1; + _workCount++; activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); @@ -1124,6 +1126,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c8fec38e5b..64c5c4af46 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -52,6 +52,17 @@ enum BitStatus NUM_STATUS }; +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_PRE_WB_APPLY, + PENDING_WB, + NUM_CACHE_STATE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -69,6 +80,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingApply; bool pendingWB; Tick lastChangedTick; + CacheState state; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -81,7 +93,8 @@ class CoalesceEngine : public BaseMemoryEngine pendingData(false), pendingApply(false), pendingWB(false), - lastChangedTick(0) 
+ lastChangedTick(0), + state(CacheState::INVALID) { items = new WorkListItem [num_elements]; } @@ -116,6 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; @@ -206,7 +220,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return needsPush.count(); } + int workCount() { return _workCount; } void recvVertexPull(); bool done(); From b5a8075df73c74ff3717791c1f48f5bfc3fe8fb5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Oct 2022 09:53:00 -0700 Subject: [PATCH 209/279] Fixing write miss issue. --- src/accl/graph/sega/coalesce_engine.cc | 92 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 30 ++++++++- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 38f05f937a..7a064c1c2f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -495,6 +495,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) maxPotentialPostPushWB--; } + bool cache_wb = false; if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); @@ -521,6 +522,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // and is scheduled to read to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -537,42 +539,44 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count this. // FIXME: This is not a totally wasteful read. e.g. all reads // for pull in BFS are like this. 
- DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); } - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } } } @@ -1045,7 +1049,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple +std::tuple CoalesceEngine::getOptimalPullAddr() { int visited_bits = 0; @@ -1066,7 +1070,7 @@ CoalesceEngine::getOptimalPullAddr() assert(vertex_send_mask == 0); activeBits.pop_front(); return std::make_tuple( - BitStatus::PENDING_READ, addr, index_offset); + WorkLocation::PENDING_READ, addr, index_offset); } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && @@ -1078,12 +1082,12 @@ CoalesceEngine::getOptimalPullAddr() assert(!cacheBlocks[block_index].pendingData); activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_CACHE, block_index, index_offset); + WorkLocation::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_MEMORY, addr, index_offset); + WorkLocation::IN_MEMORY, addr, index_offset); } } activeBits.pop_front(); @@ -1091,20 +1095,20 @@ CoalesceEngine::getOptimalPullAddr() visited_bits++; } - return std::make_tuple(BitStatus::GARBAGE, 0, 0); + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - BitStatus bit_status; + WorkLocation bit_status; Addr location; int offset; std::tie(bit_status, location, offset) = getOptimalPullAddr(); - if (bit_status != BitStatus::GARBAGE) { - if (bit_status == BitStatus::PENDING_READ) { + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { // renaming the outputs to thier local names. 
Addr addr = location; int index_offset = offset; @@ -1116,7 +1120,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; numPullsReceived--; } - if (bit_status == BitStatus::IN_CACHE) { + if (bit_status == WorkLocation::IN_CACHE) { // renaming the outputs to their local names. int block_index = (int) location; int wl_offset = offset; @@ -1145,7 +1149,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) stats.lastVertexPushTime = curTick() - stats.lastResetTick; numPullsReceived--; } - if (bit_status == BitStatus::IN_MEMORY) { + if (bit_status == WorkLocation::IN_MEMORY) { if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { Addr addr = location; int index_offset = offset; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 64c5c4af46..05e268270a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,7 +43,7 @@ namespace gem5 { -enum BitStatus +enum WorkLocation { PENDING_READ, IN_CACHE, @@ -65,6 +65,32 @@ enum CacheState class MPU; + +// TODO: Add active bit to WorkListItem class. Check active bit before activate +// Only activate if necessary and not active before. 
+class WorkDirectory +{ + private: + CoalesceEngine* owner; + Addr memoryAtomSize; + int atomBlockSize; + size_t elementSize; + + int _workCount; + public: + AddrRange memoryRange; + WorkDirectory(Addr atom_size, int block_size, size_t element_size): + memoryAtomSize(atom_size), atomBlockSize(block_size), + elementSize(element_size), _workCount(0) + {} + + void activate(Addr addr); + void deactivate(Addr addr); + int workCount(); + std::tuple getNextWork(); + +}; + class CoalesceEngine : public BaseMemoryEngine { private: @@ -140,7 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to From e459799e0a5b604c61cbacaca88ee270d0d09fe9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 1 Nov 2022 00:15:16 -0700 Subject: [PATCH 210/279] Restructuring the cache. 
--- src/accl/graph/base/data_structs.hh | 17 +- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/CoalesceEngine_bak.py | 50 + src/accl/graph/sega/coalesce_engine.cc | 553 +++------ src/accl/graph/sega/coalesce_engine.hh | 107 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 ++++++++++++++++++++ src/accl/graph/sega/coalesce_engine_bak.hh | 218 ++++ 7 files changed, 1834 insertions(+), 421 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py create mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc create mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d9028e2f10..070e635736 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -45,29 +45,33 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 31; + bool active: 1; std::string to_string() { return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - tempProp, prop, degree, edgeIndex); + "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, + active ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + active(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): + uint32_t edge_index, uint32_t degree, bool active): tempProp(temp_prop), prop(prop), + edgeIndex(edge_index), degree(degree), - edgeIndex(edge_index) + active(active) {} }; @@ -88,7 +92,6 @@ struct __attribute__ ((packed)) Edge weight(weight), neighbor(neighbor) {} - }; static_assert(isPowerOf2(sizeof(WorkListItem))); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 1fd3b968c5..8ec9214b49 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -38,8 +38,6 @@ class CoalesceEngine(BaseMemoryEngine): num_mshr_entry = Param.Int("Number of MSHR entries.") - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py new file mode 100644 index 0000000000..1fd3b968c5 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine_bak.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + num_mshr_entry = Param.Int("Number of MSHR entries.") + + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. Used to limit b/w.") + + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a064c1c2f..66ff66c068 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), + maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), + numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextPreWBApplyEvent([this] { processNextPreWBApplyEvent(); }, name() + ".nextPreWBApplyEvent"), + nextPrePushApplyEvent([this] { + 
processNextPrePushApplyEvent(); + }, name() + ".nextPrePushApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -66,7 +69,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - needsPush.reset(); } void @@ -83,15 +85,10 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + // TODO: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. - // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].state == CacheState::IDLE); pkt->makeResponse(); pkt->setDataFromBlock( @@ -100,8 +97,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); + // FIXME: Pass workdirectory to graphworkload.init graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } @@ -110,6 +107,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) bool CoalesceEngine::done() { + // FIXME: Fix this later return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -123,6 +121,8 @@ CoalesceEngine::getBlockIndex(Addr addr) return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } +// FIXME: This and the next function should be moved to the +// WorkDirectory. 
// addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBitIndexBase(Addr addr) @@ -134,6 +134,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) return atom_index * block_bits; } +// FIXME: Read FIXME: Above // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) @@ -161,17 +162,10 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + // Hit DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. + assert(cacheBlocks[block_index].state != CacheState::INVALID); responseQueue.push_back(std::make_tuple( addr, cacheBlocks[block_index].items[wl_offset], curTick())); @@ -189,12 +183,7 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].state = CacheState::BUSY; // HACK: If a read happens on the same cycle as another operation such // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are @@ -210,34 +199,20 @@ CoalesceEngine::recvWLRead(Addr addr) stats.numVertexReads++; return true; } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", __func__, addr); stats.readHitUnderMisses++; assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -245,195 +220,52 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); + + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if (cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = 
CacheState::PENDING_WB; memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); + processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; + } else { + // NOTE: move the cache block to invalid state + // FIXME: Fix the issue below. + // May need to activate tracking for this + cacheBlocks[block_index].reset(); } } + // return int instead of bool to tell WLEngine to whether + // roll the first entry in the queue. + return false; } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + if (MSHR.size() < numMSHREntries) { + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + return true; + } else { return false; } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; } } } @@ -589,8 +421,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } - - // TODO: Probably check for done here too. delete pkt; return true; } @@ -771,15 +601,53 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextPreWBApplyEvent() { - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + int block_index = preWBApplyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsPreWBApply); + bool block_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + block_active |= active; + if (active) { + // cacheWorkCount++; + // FUTUREME: When pulling from activeCacheBlocks, in case we + // face a block that is not in idle state, we basically pop + // that entry and push it to the back. We only delete entries + // in this buffer if pushed or evicted. + activeCacheBlocks.push_back(block_index); + } + } + if (block_active && !owner->running()) { + owner->start(); + } + + cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + } else { + // FIXME: Solve below issue. + // Not dirty but could be active still. 
+ // need to activate tracking + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + } + cacheBlocks[block_index].lastChangedTick = curTick(); + } else { + + } if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); @@ -883,77 +751,85 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - + // TODO: Figure out if this is still necessary. if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { return; } - assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + if (cacheBlocks[block_index].addr = wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); 
- DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } + cacheBlocks[block_index].dirty = true; + need_send_pkt = false; postPushWBQueue.erase(wb); + // NOTE(review): erase() invalidates wb; at most one atom per addr, so stop here. + break; + } + } + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); need_send_pkt = false; + activeBuffer.erase(ab); + // NOTE(review): erase() invalidates ab; stop scanning after the match. + break; } } + if (!need_send_pkt) { + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + for (auto it = 
MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + pendingVertexPullReads.end()) { need_send_pkt = false; } @@ -964,11 +840,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - 
stats.numDoubleMemReads++; - } } } @@ -979,19 +850,27 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + Addr base_addr = cacheBlocks[block_index].addr; + for (int index = 0; index < numElementsPerLine; index++) { + if (cacheBlocks[block_index].items[index].active) { + Addr vertex_addr = base_addr + index * sizeof(WorkListItem); + // NOTE: Implement this + // workdir.activate() + // cacheWorkCount--; + } + } + if (activeCacheBlocks.find(block_index)) { + activeCacheBlocks.erase(block_index); + } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -999,30 +878,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); + cacheBlocks[block_index].reset(); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -1049,55 +905,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { @@ -1262,8 +1069,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05e268270a..8da67c7b43 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -60,9 +60,26 @@ enum CacheState IDLE, PENDING_PRE_WB_APPLY, PENDING_WB, + PENDING_PRE_PUSH_APPLY, NUM_CACHE_STATE }; +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_PRE_WB_APPLY", + "PENDING_WB", + "PENDING_PRE_PUSH_APPLY" +}; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH +}; + class MPU; @@ -71,7 +88,6 @@ class MPU; class WorkDirectory { private: - CoalesceEngine* owner; Addr 
memoryAtomSize; int atomBlockSize; size_t elementSize; @@ -88,7 +104,6 @@ class WorkDirectory void deactivate(Addr addr); int workCount(); std::tuple getNextWork(); - }; class CoalesceEngine : public BaseMemoryEngine @@ -100,47 +115,54 @@ class CoalesceEngine : public BaseMemoryEngine Addr addr; uint64_t busyMask; bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; + bool dirty; + bool hasConflict; + bool needsPreWBApply; CacheState state; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; + Tick lastChangedTick; Block() {} Block(int num_elements): addr(-1), busyMask(0), valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - state(CacheState::INVALID) + dirty(false), + hasConflict(false), + needsPreWBApply(false), + state(CacheState::INVALID), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + needsPreWBApply = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? "true" : "false", lastChangedTick); + "dirty: %s, hasConflict: %s, needsPreWBApply: %s, " + "state: %s, lastChangedTick: %lu}", addr, busyMask, + valid ? "true" : "false", dirty ? "true" : "false", + hasConflict ? "true" : "false", + needsPreWBApply ? 
"true" : "false", + cacheStateStrings[state], lastChangedTick); } }; - struct SenderState : public Packet::SenderState + struct ReadPurpose : public Packet::SenderState { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } }; + MPU* owner; GraphWorkload* graphWorkload; @@ -150,28 +172,33 @@ class CoalesceEngine : public BaseMemoryEngine int onTheFlyReqs; int numMSHREntries; - int numTgtsPerMSHR; std::unordered_map> MSHR; + + // Response route to WLEngine int maxRespPerCycle; std::deque> responseQueue; - int _workCount; + // Tracking work in cache + int cacheWorkCount; int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; + UniqueFIFO preWBApplyQueue; + // NOTE: Remember to erase from this upon eviction from cache + UniqueFIFO activeCacheBlocks; + + int pendingPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + int activeBufferSize; int postPushWBQueueSize; + std::deque> activeBuffer; std::deque> postPushWBQueue; int getBlockIndex(Addr addr); + // TODO: Should be moved to WorkDirectory int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. 
- std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -188,6 +215,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextPreWBApplyEvent; void processNextPreWBApplyEvent(); + EventFunctionWrapper nextPrePushApplyEvent; + void processNextPrePushApplyEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -223,7 +253,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; @@ -246,6 +275,8 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); + // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory + // workcount. int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc new file mode 100644 index 0000000000..7a064c1c2f --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.cc @@ -0,0 +1,1308 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + needsPush.reset(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. 
+ // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + // TODO: Add and implement init function for GraphWorkload. + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + memPort.sendFunctional(pkt); + } +} + +bool +CoalesceEngine::done() +{ + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + return atom_index * block_bits; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); +} + +bool +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < 
numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); + // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextResponseEvent for latency cycles in + // the future. + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. 
+ cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return true; + } else { + // miss + // FIXME: Make this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); + if (MSHR.size() == numMSHREntries) { + // Out of MSHR entries + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" + "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection + stats.mshrEntryShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + "to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " + "not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; + 
MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + stats.readMisses++; + // TODO: Add readConflicts here. + stats.numVertexReads++; + return true; + } else { + // MSHR available and no conflict + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " + "Allocating a cache line for it.\n" + , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); + + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + 
} + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + stats.readMisses++; + stats.numVertexReads++; + return true; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); + + // TODO: Might want to differentiate between different misses. + stats.readMisses++; + + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; + return true; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + return true; + } + + onTheFlyReqs--; + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + + bool do_wb = false; + if (pkt->findNextSenderState()) { + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. 
+ + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); + int it = getBitIndexBase(addr); + uint64_t send_mask = pendingVertexPullReads[addr]; + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); + needsPush[it + i] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + // TODO: Add a stat to count this. + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pendingVertexPullReads.erase(addr); + maxPotentialPostPushWB--; + } + + bool cache_wb = false; + if (cacheBlocks[block_index].addr == addr) { + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR.find(block_index) != MSHR.end()); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); + } + 
cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; + cacheBlocks[block_index].pendingData = false; + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); + } + + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + } + + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + + + // TODO: Probably check for done here too. + delete pkt; + return true; +} + +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. 
+ assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; + stats.numVertexWrites++; + } + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { + cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + // TODO: Make this more general and programmable. + if ((cacheBlocks[block_index].busyMask == 0)) { + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. 
+ if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + 
(!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + +} + +void +CoalesceEngine::processNextPreWBApplyEvent() +{ + int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + needsPush[bit_index_base + index] = 1; + _workCount++; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); + } + } + } + } + stats.bitvectorLength.sample(needsPush.count()); + + assert(cacheBlocks[block_index].needsWB); + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + + assert(MSHR.size() <= numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + 
[this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; + } + + applyQueue.pop_front(); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + + if (done()) { + owner->recvDoneSignal(); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + 
responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != + pendingVertexPullReads.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + // onTheFlyReqs++; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + stats.numInvalidWriteBacks++; + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + +std::tuple +CoalesceEngine::getOptimalPullAddr() +{ + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::PENDING_READ, addr, index_offset); + } else { + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_CACHE, block_index, index_offset); + // Otherwise if it is in memory + } else if ((cacheBlocks[block_index].addr != addr)) { + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_MEMORY, addr, index_offset); + } + } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; + } + + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + WorkLocation bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_CACHE) { + // renaming the outputs to their local names. 
+ int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_MEMORY) { + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + } + } + + stats.bitvectorSearchStatus[bit_status]++; + + if (numPullsReceived > 0) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvVertexPull() +{ + bool should_schedule = (numPullsReceived == 0); + numPullsReceived++; + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. 
" + "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh new file mode 100644 index 0000000000..0787a334c1 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.hh @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + + + +namespace gem5 +{ + +enum WorkLocation +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE, + NUM_STATUS +}; + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + Tick lastChangedTick; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + lastChangedTick(0), + { + items = new WorkListItem [num_elements]; + } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); + } + }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + MPU* owner; + GraphWorkload* graphWorkload; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + int onTheFlyReqs; + int numMSHREntries; + int numTgtsPerMSHR; + std::unordered_map> MSHR; + int maxRespPerCycle; + std::deque> responseQueue; + + int _workCount; + int numPullsReceived; + UniqueFIFO applyQueue; + std::bitset needsPush; + std::deque activeBits; + int postPushWBQueueSize; + std::deque> postPushWBQueue; + + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalPullAddr(); + + int maxPotentialPostPushWB; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + 
statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; + + statistics::Vector bitvectorSearchStatus; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; + statistics::Histogram memoryFunctionLatency; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params ¶ms); + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt); + + bool recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return _workCount; } + void recvVertexPull(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ From ebe3fc165e9b3c8a9ec0a29feb64e3a8b0e798b7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 00:05:27 -0800 Subject: [PATCH 211/279] First working and tested version of workdirectory. 
--- configs/accl/bfs.py | 1 + configs/accl/sega.py | 6 +- src/accl/graph/base/data_structs.hh | 23 +- src/accl/graph/base/graph_workload.cc | 236 ++-- src/accl/graph/base/graph_workload.hh | 67 +- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 7 +- src/accl/graph/sega/CoalesceEngine_bak.py | 50 - src/accl/graph/sega/SConscript | 5 +- src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/centeral_controller.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 932 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 117 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 -------------------- src/accl/graph/sega/coalesce_engine_bak.hh | 218 ---- src/accl/graph/sega/enums.cc | 57 + src/accl/graph/sega/enums.hh | 66 + src/accl/graph/sega/mpu.cc | 6 - src/accl/graph/sega/mpu.hh | 6 +- src/accl/graph/sega/push_engine.cc | 37 +- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 28 +- src/accl/graph/sega/wl_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 212 ++++ src/mem/mem_ctrl.cc | 2 +- 25 files changed, 1030 insertions(+), 2376 deletions(-) delete mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh create mode 100644 src/accl/graph/sega/enums.cc create mode 100644 src/accl/graph/sega/enums.hh create mode 100644 src/accl/graph/sega/work_directory.hh diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index fc32b96642..a201acd4d1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -68,6 +68,7 @@ def get_inputs(): m5.instantiate() + system.create_pop_count_directory(256) system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0f4b133791..54f22b1377 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -54,8 +54,8 @@ def __init__(self, 
edge_memory_size: str, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=64, - num_tgts_per_mshr=64, max_resp_per_cycle=8, + active_buffer_size = 64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -139,6 +139,10 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 070e635736..84233ae39c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -36,8 +36,6 @@ #include #include -#define MAX_BITVECTOR_SIZE (1 << 28) - namespace gem5 { @@ -45,33 +43,28 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; + uint32_t degree : 32; uint32_t edgeIndex : 32; - uint32_t degree : 31; - bool active: 1; std::string to_string() { - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, - active ? 
"true" : "false"); + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u}", tempProp, prop, edgeIndex, degree); } WorkListItem(): tempProp(0), prop(0), - edgeIndex(0), degree(0), - active(false) + edgeIndex(0) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t edge_index, uint32_t degree, bool active): + uint32_t degree, uint32_t edge_index): tempProp(temp_prop), prop(prop), - edgeIndex(edge_index), degree(degree), - active(active) + edgeIndex(edge_index) {} }; @@ -111,8 +104,8 @@ struct MetaEdge { std::string to_string() { - return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", - src, dst, weight); + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); } }; diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 07accff44f..446509201f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -56,39 +56,27 @@ readFromFloat(float value) return float_bits; } -BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): - GraphWorkload(), initValue(init_value), atomSize(atom_size) -{ - initAddrBase = roundDown(init_addr, atomSize); - initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - - void -BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) { - if (pkt->getAddr() == initAddrBase) { - WorkListItem items[numElementsPerLine]; + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); - pkt->writeDataToBlock((uint8_t*) items, atomSize); + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - items[initIndex].tempProp = initValue; - items[initIndex].prop = initValue; - if 
(items[initIndex].degree > 0) { - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); - _workCount++; - } + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } pkt->deleteData(); pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } - } uint32_t @@ -104,28 +92,16 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::applyCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem wl) { - return wl.tempProp < wl.prop; -} - -bool -BFSWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - wl.prop = wl.tempProp; - if (wl.degree > 0) { - return true; - } - } - return false; + return (wl.tempProp < wl.prop) && (wl.degree > 0); } -std::tuple -BFSWorkload::prePushApply(WorkListItem& wl) +uint32_t +BFSWorkload::apply(WorkListItem& wl) { - uint32_t value = wl.prop; - return std::make_tuple(value, true, false); + wl.prop = wl.tempProp; + return wl.prop; } std::string @@ -137,92 +113,92 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): - GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -{ - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - -void -PRWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) -{ - WorkListItem items[numElementsPerLine]; - - pkt->writeDataToBlock((uint8_t*) items, atomSize); - for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - if (items[i].degree > 0) { - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - 
_workCount++; - } - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); -} - -uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) -{ - float update_float = writeToFloat(update); - float value_float = writeToFloat(value); - return readFromFloat(update_float + value_float); -} - -uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) -{ - float value_float = writeToFloat(value); - float weight_float = 1.0; - - return readFromFloat(alpha * value_float * weight_float); -} - -bool -PRWorkload::applyCondition(WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return dist >= threshold; -} - -bool -PRWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl) && (wl.degree > 0)) { - return true; - } - return false; -} - -std::tuple -PRWorkload::prePushApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; - uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; - return std::make_tuple(delta_uint, true, true); - } - return std::make_tuple(0, false, false); -} - -std::string -PRWorkload::printWorkListItem(const WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, temp_float, wl.degree, wl.edgeIndex - ); -} +// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): +// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +// { +// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +// } + +// void +// PRWorkload::init(PacketPtr pkt, int bit_index_base, +// std::bitset& needsPush, +// std::deque& activeBits, +// int& _workCount) +// { +// WorkListItem items[numElementsPerLine]; + 
+// pkt->writeDataToBlock((uint8_t*) items, atomSize); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// if (items[i].degree > 0) { +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// _workCount++; +// } +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, atomSize); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = 1.0; + +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::applyCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return dist >= threshold; +// } + +// bool +// PRWorkload::preWBApply(WorkListItem& wl) +// { +// if (applyCondition(wl) && (wl.degree > 0)) { +// return true; +// } +// return false; +// } + +// std::tuple +// PRWorkload::apply(WorkListItem& wl) +// { +// if (applyCondition(wl)) { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return std::make_tuple(delta_uint, true, true); +// } +// return std::make_tuple(0, false, false); +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, temp_float, wl.degree, 
wl.edgeIndex +// ); +// } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 6bbc4935c2..f71955bd16 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,6 +34,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" #include "mem/packet.hh" @@ -46,70 +47,54 @@ class GraphWorkload GraphWorkload() {} ~GraphWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) = 0; + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; - virtual bool applyCondition(WorkListItem wl) = 0; - virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload { private: - uint64_t initAddrBase; - int initIndex; + uint64_t initAddr; uint32_t initValue; - int numElementsPerLine; - int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} ~BFSWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); + virtual void init(PacketPtr pkt, WorkDirectory* dir); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual uint32_t apply(WorkListItem& wl); 
+ virtual bool activeCondition(WorkListItem wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - int numElementsPerLine; - int atomSize; - - public: - PRWorkload(float alpha, float threshold, int atom_size); +// public: +// PRWorkload(float alpha, float threshold); - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..0c21833a05 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 8ec9214b49..a447dedc3d 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ 
b/src/accl/graph/sega/CoalesceEngine.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -40,9 +41,13 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") - + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") + cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py deleted file mode 100644 index 1fd3b968c5..0000000000 --- a/src/accl/graph/sega/CoalesceEngine_bak.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseMemoryEngine import BaseMemoryEngine - -class CoalesceEngine(BaseMemoryEngine): - type = 'CoalesceEngine' - cxx_header = "accl/graph/sega/coalesce_engine.hh" - cxx_class = 'gem5::CoalesceEngine' - - cache_size = Param.MemorySize("Size of the internal SRAM array.") - - num_mshr_entry = Param.Int("Number of MSHR entries.") - - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " - "requestor in each cycle. 
Used to limit b/w.") - - post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " - "apply process for applications that require " - "the apply process to happen exactly before " - "pushing the edgePointer to the PushEngine.") - diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d411be9ac..b3e1a838fb 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -37,6 +37,7 @@ SimObject("WLEngine.py", sim_objects=["WLEngine"]) Source("base_memory_engine.cc") Source("centeral_controller.cc") Source("coalesce_engine.cc") +Source("enums.cc") Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") @@ -45,10 +46,10 @@ DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") -DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") DebugFlag("WLEngine") CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", - "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fc2262e111..883992e64e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,6 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { + mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } @@ -106,14 +107,14 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); + workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold, 
system->cacheLineSize()); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); +// } void CenteralController::recvDoneSignal() @@ -144,6 +145,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { + workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 9ddb1b35f0..6eb07dbcac 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -35,7 +35,6 @@ #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" -#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -64,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66ff66c068..0aa61345f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" +#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,26 +43,23 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), + BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / 
peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), - numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), pendingPullReads(0), + activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), - pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - nextPrePushApplyEvent([this] { - processNextPrePushApplyEvent(); - }, name() + ".nextPrePushApplyEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,6 +67,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + activeBuffer.clear(); + postPushWBQueue.clear(); } void @@ -85,7 +85,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // TODO: Check postPushWBQueue for hits + // FIXME: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -97,54 +97,70 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - int bit_index_base = getBitIndexBase(pkt->getAddr()); - // FIXME: Pass workdirectory to graphworkload.init - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + graphWorkload->init(pkt, directory); + if (pkt->getAddr() > 
lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } memPort.sendFunctional(pkt); } } +void +CoalesceEngine::postMemInitSetup() +{ + directory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::createPopCountDirectory(int atoms_per_block) +{ + directory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + bool CoalesceEngine::done() { - // FIXME: Fix this later - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && + activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); } -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) +bool +CoalesceEngine::timeToPull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; + return (activeBuffer.size() + pendingPullReads) < activeBufferSize; } -// FIXME: This and the next function should be moved to the -// WorkDirectory. 
-// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) +bool +CoalesceEngine::canSchedulePull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; + // TODO: Maybe a good idea to change this to + // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize + return pullsScheduled < 1; } -// FIXME: Read FIXME: Above -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) +bool +CoalesceEngine::workLeftInMem() { - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); + return !directory->empty(); } bool +CoalesceEngine::pullCondition() +{ + return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus CoalesceEngine::recvWLRead(Addr addr) { Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); @@ -163,6 +179,9 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { + return ReadReturnStatus::REJECT_NO_ROLL; + } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); @@ -197,7 +216,7 @@ CoalesceEngine::recvWLRead(Addr addr) 
schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss @@ -207,7 +226,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); @@ -217,7 +235,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); @@ -232,20 +250,37 @@ CoalesceEngine::recvWLRead(Addr addr) if (cacheBlocks[block_index].state == CacheState::IDLE) { if (cacheBlocks[block_index].dirty) { cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // NOTE: move the cache block to invalid state - // FIXME: Fix the issue below. - // May need to activate tracking for this + // NOTE: The cache block could still be active but + // not dirty. If active we only have to active tracking + // but can throw the data away. 
+ bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } + // NOTE: Bring the cache line to invalid state. + // NOTE: Above line where we set hasConflict to true + // does not matter anymore since we reset the cache line. cacheBlocks[block_index].reset(); } + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + return ReadReturnStatus::REJECT_ROLL; } - // return int instead of bool to tell WLEngine to whether - // roll the first entry in the queue. - return false; } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); @@ -255,16 +290,21 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].valid = false; cacheBlocks[block_index].dirty = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].needsPreWBApply = false; cacheBlocks[block_index].state = CacheState::PENDING_DATA; cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextRead(block_index, schedule_tick); }, block_index, curTick()); - return true; + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; } else { - return false; + return ReadReturnStatus::REJECT_ROLL; } } } @@ -276,116 +316,87 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", __func__, pkt->print()); + + onTheFlyReqs--; if (pkt->isWrite()) { DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - 
WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. 
+ // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. 
+ if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); 
- } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeCacheBlocks.push_back(block_index); + } - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); - if (aligned_miss_addr == addr) { + assert(aligned_miss_addr == cacheBlocks[block_index].addr); int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " "cacheBlocks[%d] can be serviced with the received " "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " @@ -400,32 +411,72 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) graphWorkload->printWorkListItem( cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); - // TODO: 
Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeBuffer.emplace_back(pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); } else { - it++; + delete pkt; + } + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; } } } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); + if (done()) { + owner->recvDoneSignal(); } - - delete pkt; return true; } -// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextResponseEvent() { @@ -450,8 +501,8 @@ CoalesceEngine::processNextResponseEvent() addr_response); responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, @@ -491,27 +542,28 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. + + // NOTE: Design does not allow for write misses. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; + cacheBlocks[block_index].dirty |= true; } cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; + if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && + (!activeCacheBlocks.find(block_index))) { + activeCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); @@ -523,188 +575,40 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // TODO: Make this more general and programmable. 
- if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = preWBApplyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - - if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsPreWBApply); - bool block_active = false; - for (int index = 0; index < numElementsPerLine; index++) { - bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - block_active |= active; - if (active) { - // cacheWorkCount++; - // FUTUREME: When pulling from activeCacheBlocks, in case we - // face a block that is not in idle state, we basically pop - // that entry and push it to the back. We only delete entries - // in this buffer if pushed or evicted. 
- activeCacheBlocks.push_back(block_index); - } - } - if (block_active && !owner->running()) { - owner->start(); - } - - cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].busyMask == 0) { if (cacheBlocks[block_index].hasConflict) { if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // FIXME: Solve below issue. - // Not dirty but could be active still. - // need to activate tracking + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } cacheBlocks[block_index].reset(); } } else { cacheBlocks[block_index].state = CacheState::IDLE; - } - cacheBlocks[block_index].lastChangedTick = curTick(); - } else { - - } - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); } - - if (done()) { + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } } @@ -740,6 +644,10 @@ CoalesceEngine::processNextMemoryEvent() if ((!memoryFunctionQueue.empty())) { schedule(nextMemoryEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -759,36 +667,68 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].valid); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); 
assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; // NOTE: Search postPushWBQueue - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr = wb_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + need_send_pkt = false; - postPushWBQueue.erase(wb); + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + cacheBlocks[block_index].addr, postPushWBQueue.size()); + } else { + wb++; } } - for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { PacketPtr ab_pkt = std::get<0>(*ab); - if (cacheBlocks[block_index].addr = ab_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { ab_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + activeCacheBlocks.push_back(block_index); + need_send_pkt = false; - activeBuffer.erase(ab); + ab = activeBuffer.erase(ab); + delete ab_pkt; + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // pullsScheduled++; + // } + DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + cacheBlocks[block_index].addr, activeBuffer.size()); + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; } } if (!need_send_pkt) { - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsPreWBApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { Addr miss_addr = *it; Addr aligned_miss_addr = @@ -828,14 +768,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].state = CacheState::BUSY; } - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { need_send_pkt = false; } if (need_send_pkt) { PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); @@ -852,25 +794,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) block_index, cacheBlocks[block_index].to_string()); if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].dirty); assert(cacheBlocks[block_index].hasConflict); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); - Addr base_addr = cacheBlocks[block_index].addr; + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. + bool atom_active = false; for (int index = 0; index < numElementsPerLine; index++) { - if (cacheBlocks[block_index].items[index].active) { - Addr vertex_addr = base_addr + index * sizeof(WorkListItem); - // NOTE: Implement this - // workdir.activate() - // cacheWorkCount--; - } + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); } - if (activeCacheBlocks.find(block_index)) { + if (atom_active) { activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); } + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -878,9 +819,8 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; cacheBlocks[block_index].reset(); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { @@ 
-896,94 +836,54 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { + if (postPushWBQueue.empty()) { + return; + } PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); if (schedule_tick == pkt_tick) { memPort.sendPacket(wb_pkt); + onTheFlyReqs++; postPushWBQueue.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); } } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; + pullsScheduled--; + if (!directory->empty()) { + Addr addr = directory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + 
memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); } } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } } void @@ -1000,26 +900,149 @@ CoalesceEngine::recvMemRetry() schedule(nextMemoryEvent, nextCycle()); } +int +CoalesceEngine::workCount() +{ + return activeCacheBlocks.size() + + directory->workCount() + activeBuffer.size(); +} + void CoalesceEngine::recvVertexPull() { - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived); stats.verticesPulled++; stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(items[index])) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active = false; + for (int index = 0; index < 
numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + // NOTE: If the atom is not active anymore. + if (!atom_active) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); + activeBuffer.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. " + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!activeCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = activeCacheBlocks.size(); + while (true) { + int block_index = activeCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + } + } + + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + } + // NOTE: If we have reached the last item in the cache block + if (!atom_active) { + 
activeCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block with index at the front of activeCacheBlocks + // is not in IDLE state, then roll the that index to the back + activeCacheBlocks.pop_front(); + activeCacheBlocks.push_back(block_index); + // NOTE: If we have visited all the items initially in the FIFO. + num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find " + "work to apply.\n", __func__); + } + + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), @@ -1036,16 +1059,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), ADD_STAT(responsePortShortage, 
statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. " - "Once for push and once to populate the cache."), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1054,13 +1072,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate - #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" - - namespace gem5 { -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -enum CacheState -{ - INVALID, - PENDING_DATA, - BUSY, - IDLE, - PENDING_PRE_WB_APPLY, - PENDING_WB, - PENDING_PRE_PUSH_APPLY, - NUM_CACHE_STATE -}; - -const char* 
cacheStateStrings[NUM_CACHE_STATE] = { - "INVALID", - "PENDING_DATA", - "BUSY", - "IDLE", - "PENDING_PRE_WB_APPLY", - "PENDING_WB", - "PENDING_PRE_PUSH_APPLY" -}; - -enum ReadDestination -{ - READ_FOR_CACHE, - READ_FOR_PUSH -}; - class MPU; - -// TODO: Add active bit to WorkListItem class. Check active bit before activate -// Only activate if necessary and not active before. -class WorkDirectory -{ - private: - Addr memoryAtomSize; - int atomBlockSize; - size_t elementSize; - - int _workCount; - public: - AddrRange memoryRange; - WorkDirectory(Addr atom_size, int block_size, size_t element_size): - memoryAtomSize(atom_size), atomBlockSize(block_size), - elementSize(element_size), _workCount(0) - {} - - void activate(Addr addr); - void deactivate(Addr addr); - int workCount(); - std::tuple getNextWork(); -}; - class CoalesceEngine : public BaseMemoryEngine { private: @@ -117,7 +54,6 @@ class CoalesceEngine : public BaseMemoryEngine bool valid; bool dirty; bool hasConflict; - bool needsPreWBApply; CacheState state; Tick lastChangedTick; Block() {} @@ -127,7 +63,6 @@ class CoalesceEngine : public BaseMemoryEngine valid(false), dirty(false), hasConflict(false), - needsPreWBApply(false), state(CacheState::INVALID), lastChangedTick(0) { @@ -140,18 +75,15 @@ class CoalesceEngine : public BaseMemoryEngine valid = false; dirty = false; hasConflict = false; - needsPreWBApply = false; state = CacheState::INVALID; lastChangedTick = 0; } std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "dirty: %s, hasConflict: %s, needsPreWBApply: %s" - "state: %s, lastChangedTick: %lu}", addr, busyMask, - valid ? "true" : "false", dirty ? "true" : "false", - hasConflict ? "true" : "false", - needsPreWBApply ? "true" : "false", + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? 
"true" : "false", cacheStateStrings[state], lastChangedTick); } }; @@ -164,8 +96,11 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; + WorkDirectory* directory; GraphWorkload* graphWorkload; + Addr lastAtomAddr; + int numLines; int numElementsPerLine; Block* cacheBlocks; @@ -179,26 +114,26 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> responseQueue; // Tracking work in cache - int cacheWorkCount; - int numPullsReceived; - UniqueFIFO preWBApplyQueue; + int pullsReceived; // NOTE: Remember to erase from this upon eviction from cache UniqueFIFO activeCacheBlocks; + int pullsScheduled; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; + std::unordered_set pendingPullAddrs; int activeBufferSize; int postPushWBQueueSize; std::deque> activeBuffer; std::deque> postPushWBQueue; + bool timeToPull(); + bool canSchedulePull(); + bool workLeftInMem(); + bool pullCondition(); int getBlockIndex(Addr addr); - // TODO: Should be moved to WorkDirectory - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -212,11 +147,8 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - EventFunctionWrapper nextPrePushApplyEvent; - void processNextPrePushApplyEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); struct CoalesceStats : public statistics::Group { @@ -236,19 +168,14 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; - 
statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; statistics::Scalar numInvalidWriteBacks; - statistics::Vector bitvectorSearchStatus; - statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; @@ -272,12 +199,14 @@ class CoalesceEngine : public BaseMemoryEngine void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); - bool recvWLRead(Addr addr); + void postMemInitSetup(); + + void createPopCountDirectory(int atoms_per_block); + + ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory - // workcount. - int workCount() { return _workCount; } + int workCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc deleted file mode 100644 index 7a064c1c2f..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.cc +++ /dev/null @@ -1,1308 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/coalesce_engine.hh" - -#include - -#include "accl/graph/sega/mpu.hh" -#include "base/intmath.hh" -#include "debug/CacheBlockState.hh" -#include "debug/CoalesceEngine.hh" -#include "debug/SEGAStructureSize.hh" -#include "mem/packet_access.hh" -#include "sim/sim_exit.hh" - -namespace gem5 -{ - -CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), - numLines((int) (params.cache_size / peerMemoryAtomSize)), - numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), - nextMemoryEvent([this] { - processNextMemoryEvent(); - }, name() + ".nextMemoryEvent"), - nextResponseEvent([this] { - processNextResponseEvent(); - }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - stats(*this) -{ - assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); - cacheBlocks = new Block [numLines]; - for (int i = 0; i < numLines; i++) { - cacheBlocks[i] = Block(numElementsPerLine); - } - needsPush.reset(); -} - -void -CoalesceEngine::registerMPU(MPU* mpu) -{ - owner = mpu; -} - -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - if (pkt->isRead()) { - assert(pkt->getSize() == peerMemoryAtomSize); - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. 
- // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - pkt->makeResponse(); - pkt->setDataFromBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - } else { - memPort.sendFunctional(pkt); - } - } else { - // TODO: Add and implement init function for GraphWorkload. - int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); - memPort.sendFunctional(pkt); - } -} - -bool -CoalesceEngine::done() -{ - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; -} - -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) -{ - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); -} - -bool -CoalesceEngine::recvWLRead(Addr addr) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = getBlockIndex(aligned_addr); - assert(block_index < numLines); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - assert(wl_offset < 
numElementsPerLine); - DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " - "This request maps to cacheBlocks[%d], aligned_addr: " - "%lu, and wl_offset: %d.\n", __func__, addr, - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); - stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. - responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset], curTick())); - - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - // HACK: If a read happens on the same cycle as another operation such - // as apply set lastChangedTick to half a cycle later so that operation - // scheduled by the original operation (apply in this example) are - // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" - cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if (!nextResponseEvent.scheduled()) { - schedule(nextResponseEvent, nextCycle()); - } - stats.numVertexReads++; - return true; - } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { - // Hit under miss - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", - __func__, addr); - stats.readHitUnderMisses++; - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - stats.numVertexReads++; - return true; - } else { - // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. - assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - 
} - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; - } - } - } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; - } - } -} - -bool -CoalesceEngine::handleMemResp(PacketPtr pkt) -{ - assert(pkt->isResponse()); - DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", - __func__, pkt->print()); - if (pkt->isWrite()) { - DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); - delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. 
- - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } - } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } - - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - 
cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } - - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - } - - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - - - // TODO: Probably check for done here too. - delete pkt; - return true; -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextResponseEvent() -{ - int num_responses_sent = 0; - - Addr addr_response; - WorkListItem worklist_response; - Tick response_queueing_tick; - while(true) { - std::tie(addr_response, worklist_response, response_queueing_tick) = - responseQueue.front(); - Tick waiting_ticks = curTick() - response_queueing_tick; - if (ticksToCycles(waiting_ticks) < 1) { - break; - } - owner->handleIncomingWL(addr_response, worklist_response); - num_responses_sent++; - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), - addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - stats.responseQueueLatency.sample( - waiting_ticks * 1e9 / getClockFrequency()); - if (num_responses_sent >= maxRespPerCycle) { - if (!responseQueue.empty()) { - stats.responsePortShortage++; - } - break; - } - if (responseQueue.empty()) { - break; - } - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = getBlockIndex(aligned_addr); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " - "wl: %s. This request maps to cacheBlocks[%d], " - "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, graphWorkload->printWorkListItem(wl), - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, - graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. - assert(cacheBlocks[block_index].addr == aligned_addr); - // cache state asserts - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - // respective bit in busyMask for wl is set. 
- assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == - (1 << wl_offset)); - - if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; - } - cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; - } - - cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, wl_offset, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. 
- if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - 
(!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - 
[this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - - if (done()) { - owner->recvDoneSignal(); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - stats.numMemoryBlocks++; - nextMemoryEvent.sleep(); - return; - } - - DPRINTF(CoalesceEngine, "%s: Processing another " - "memory function.\n", __func__); - std::function next_memory_function; - int next_memory_function_input; - Tick next_memory_function_tick; - std::tie( - next_memory_function, - next_memory_function_input, - next_memory_function_tick) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input, next_memory_function_tick); - memoryFunctionQueue.pop_front(); - stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) - * 1e9 / getClockFrequency()); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - // A cache block should not be touched while it's waiting for data. - // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - - if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { - return; - } - - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - bool need_send_pkt = true; - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) - { - PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { - wb_pkt->writeDataToBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - 
responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - postPushWBQueue.erase(wb); - need_send_pkt = false; - } - } - - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { - need_send_pkt = false; - } - - if (need_send_pkt) { - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; - } - } -} - -void -CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. 
- assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - - PacketPtr pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " - "Addr: %lu, size = %d.\n", __func__, - pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " - "write back has been scheduled for it. 
Ignoring " - "the current write back scheduled at tick %lu for " - "the right function scheduled later.\n", - __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; - } -} - -void -CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) -{ - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - postPushWBQueue.pop_front(); - } -} - -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - -void -CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) -{ - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - } - } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - - if (!nextMemoryEvent.pending()) { - 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -void -CoalesceEngine::recvVertexPull() -{ - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; - - stats.verticesPulled++; - stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } -} - -CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) - : statistics::Group(&_coalesce), - coalesce(_coalesce), - lastResetTick(0), - ADD_STAT(numVertexReads, statistics::units::Count::get(), - "Number of memory vertecies read from cache."), - ADD_STAT(numVertexWrites, statistics::units::Count::get(), - "Number of memory vertecies written to cache."), - ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits."), - ADD_STAT(readMisses, statistics::units::Count::get(), - "Number of cache misses."), - ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), - "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), - ADD_STAT(responsePortShortage, statistics::units::Count::get(), - "Number of times a response has been " - "delayed because of port shortage. "), - ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), - "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. 
" - "Once for push and once to populate the cache."), - ADD_STAT(verticesPulled, statistics::units::Count::get(), - "Number of times a pull request has been sent by PushEngine."), - ADD_STAT(verticesPushed, statistics::units::Count::get(), - "Number of times a vertex has been pushed to the PushEngine"), - ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), - "Time of the last pull request. (Relative to reset_stats)"), - ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), - "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), - ADD_STAT(hitRate, statistics::units::Ratio::get(), - "Hit rate in the cache."), - ADD_STAT(vertexPullBW, statistics::units::Rate::get(), - "Rate at which pull requests arrive."), - ADD_STAT(vertexPushBW, statistics::units::Rate::get(), - "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(responseQueueLatency, statistics::units::Second::get(), - "Histogram of the response latency to WLEngine. 
(ns)"), - ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), - "Histogram of the latency of processing a memory function.") -{ -} - -void -CoalesceEngine::CoalesceStats::regStats() -{ - using namespace statistics; - - bitvectorSearchStatus.init(NUM_STATUS); - bitvectorSearchStatus.subname(0, "PENDING_READ"); - bitvectorSearchStatus.subname(1, "IN_CACHE"); - bitvectorSearchStatus.subname(2, "IN_MEMORY"); - bitvectorSearchStatus.subname(3, "GARBAGE"); - - hitRate = (readHits + readHitUnderMisses) / - (readHits + readHitUnderMisses + readMisses); - - vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; - - vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); - responseQueueLatency.init(64); - memoryFunctionLatency.init(64); -} - -void -CoalesceEngine::CoalesceStats::resetStats() -{ - statistics::Group::resetStats(); - - lastResetTick = curTick(); -} - -} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh deleted file mode 100644 index 0787a334c1..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.hh +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ - -#include - -#include "accl/graph/base/data_structs.hh" -#include "accl/graph/base/graph_workload.hh" -#include "accl/graph/sega/base_memory_engine.hh" -#include "base/cprintf.hh" -#include "base/statistics.hh" -#include "params/CoalesceEngine.hh" - - - -namespace gem5 -{ - -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -class MPU; - -class CoalesceEngine : public BaseMemoryEngine -{ - private: - struct Block - { - WorkListItem* items; - Addr addr; - uint64_t busyMask; - bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; - Block() {} - Block(int num_elements): - addr(-1), - busyMask(0), - valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - { - items = new WorkListItem [num_elements]; - } - - std::string to_string() { - return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? 
"true" : "false", lastChangedTick); - } - }; - - struct SenderState : public Packet::SenderState - { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} - }; - MPU* owner; - GraphWorkload* graphWorkload; - - int numLines; - int numElementsPerLine; - Block* cacheBlocks; - - int onTheFlyReqs; - int numMSHREntries; - int numTgtsPerMSHR; - std::unordered_map> MSHR; - int maxRespPerCycle; - std::deque> responseQueue; - - int _workCount; - int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; - int postPushWBQueueSize; - std::deque> postPushWBQueue; - - int getBlockIndex(Addr addr); - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; - - MemoryEvent nextMemoryEvent; - void processNextMemoryEvent(); - void processNextRead(int block_index, Tick schedule_tick); - void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int ignore, Tick schedule_tick); - void processNextPostPushWB(int ignore, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; - - EventFunctionWrapper nextResponseEvent; - void processNextResponseEvent(); - - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - struct CoalesceStats : public statistics::Group - { - CoalesceStats(CoalesceEngine &coalesce); - - virtual void regStats() override; - - virtual void resetStats() override; - - CoalesceEngine &coalesce; - - Tick lastResetTick; - - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - 
statistics::Scalar responsePortShortage; - statistics::Scalar numMemoryBlocks; - statistics::Scalar numDoubleMemReads; - statistics::Scalar verticesPulled; - statistics::Scalar verticesPushed; - statistics::Scalar lastVertexPullTime; - statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; - statistics::Scalar numInvalidWriteBacks; - - statistics::Vector bitvectorSearchStatus; - - statistics::Formula hitRate; - statistics::Formula vertexPullBW; - statistics::Formula vertexPushBW; - - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; - statistics::Histogram responseQueueLatency; - statistics::Histogram memoryFunctionLatency; - }; - - CoalesceStats stats; - - protected: - virtual void recvMemRetry() override; - virtual bool handleMemResp(PacketPtr pkt) override; - - public: - PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); - void registerMPU(MPU* mpu); - - void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); - - bool recvWLRead(Addr addr); - void recvWLWrite(Addr addr, WorkListItem wl); - - int workCount() { return _workCount; } - void recvVertexPull(); - - bool done(); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..8c9d223178 --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB", + "LOCKED_FOR_APPLY" +}; + + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..e7a8f84452 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + LOCKED_FOR_APPLY, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b30060238d..f661bd68a6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -87,12 +87,6 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, pushEngine->recvVertexPush(addr, delta, edge_index, degree); } -void -MPU::recvPrevPullCorrection() -{ - pushEngine->recvPrevPullCorrection(); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f3b29f603..ad18a0d5a5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/push_engine.hh" #include 
"accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" @@ -64,10 +65,12 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); - bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); void recvWorkload(GraphWorkload* Workload); @@ -77,7 +80,6 @@ class MPU : public SimObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 07f37a28dc..a17991e335 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -155,13 +155,13 @@ void PushEngine::start() { assert(!_running); - assert(!nextVertexPullEvent.scheduled()); + // assert(!nextVertexPullEvent.scheduled()); _running = true; stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); - if (vertexSpace()) { + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { schedule(nextVertexPullEvent, nextCycle()); } } @@ -169,17 +169,16 @@ PushEngine::start() void PushEngine::processNextVertexPullEvent() { - // TODO: change edgePointerQueueSize - numPendingPulls++; - owner->recvVertexPull(); - - if (!workLeft()) { + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { _running = false; lastIdleEntranceTick = curTick(); - } - - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); } } @@ -197,9 +196,9 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } @@ -209,16 +208,6 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, } } -void -PushEngine::recvPrevPullCorrection() -{ - assert(numPendingPulls > 0); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); - } -} - void PushEngine::processNextMemoryReadEvent() { @@ -255,7 +244,7 @@ PushEngine::processNextMemoryReadEvent() } } - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2e1de25390..08cceb14f0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -32,6 +32,7 @@ #include 
"accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -199,7 +200,6 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvReqRetry(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a698f2cc0a..2b305e1557 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -134,7 +134,7 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::checkRetryReq() { - for (int i = 0; i < inPorts.size(); ++i) { + for (int i = 0; i < inPorts.size(); i++) { inPorts[i].checkRetryReq(); } } @@ -191,12 +191,8 @@ WLEngine::processNextReadEvent() if (registerFile.size() < registerFileSize) { DPRINTF(WLEngine, "%s: There are free registers available in the " "registerFile.\n", __func__); - // TODO: It might be a good idea for WLEngine to act differently - // on cache rejects. As a first step the cache should not just - // return a boolean value. It should return an integer/enum - // to tell WLEngine why it rejected the read request. Their might - // be things that WLEngine can do to fix head of the line blocking. 
- if (owner->recvWLRead(update_addr)) { + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -209,7 +205,8 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -220,6 +217,17 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); checkRetryReq(); vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -227,7 +235,6 @@ WLEngine::processNextReadEvent() stats.registerShortage++; } } else { - // TODO: Generalize this to reduce function rather than just min DPRINTF(WLEngine, "%s: A register has already been allocated for " "addr: %lu in registerFile. 
registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); @@ -238,7 +245,8 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f442d6060e..b5ad3d9040 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -35,6 +35,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/enums.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh new file mode 100644 index 0000000000..4102e29cd3 --- /dev/null +++ b/src/accl/graph/sega/work_directory.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ + +#include "base/addr_range.hh" +#include "base/types.hh" + +namespace gem5 +{ + +class WorkDirectory +{ + public: + virtual void activate(Addr atom_addr) = 0; + virtual void deactivate(Addr atom_addr) = 0; + virtual Addr getNextWork() = 0; + + virtual int workCount() = 0; + bool empty() { return workCount() == 0; } + + virtual void setLastAtomAddr(Addr atom_addr) = 0; +}; + +class PopCountDirectory: public WorkDirectory +{ + private: + AddrRange memoryRange; + + int numAtomsPerBlock; + int memoryAtomSize; + int blockSize; + + uint32_t _workCount; + + int numCounters; + int lastCounterIndex; + uint32_t* popCount; + + uint32_t currentIndex; + uint32_t currentCounter; + + int getIndexFromAtomAddr(Addr atom_addr) + { + assert((atom_addr % memoryAtomSize) == 0); + Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr); + int index = (int) (trimmed_addr / blockSize); + return index; + } + + Addr getAtomAddrFromIndex(int block_index, int atom_index) + { + Addr block_addr = block_index * blockSize; + Addr trimmed_addr = block_addr + atom_index * memoryAtomSize; + return memoryRange.addIntlvBits(trimmed_addr); + } + + public: + PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size): + WorkDirectory(), + memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), + memoryAtomSize(atom_size), _workCount(0), + currentIndex(0), currentCounter(0) + { + blockSize = numAtomsPerBlock * memoryAtomSize; + int numCounters = (int) (memoryRange.size() / blockSize); + lastCounterIndex = numCounters - 1; + popCount = new uint32_t [numCounters]; + for (int index = 0; index < numCounters; index++) { + popCount[index] = 0; + } + } + + // CAUTION: This should only be called when the work + // directory **is not** tracking the the atom with atom_addr + virtual void activate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = 
popCount[index]; + popCount[index]++; + _workCount++; + assert(popCount[index] > prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + // CAUTION: This should only be called when the work + // directory **is** tracking the the atom with atom_addr + virtual void deactivate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]--; + _workCount--; + assert(popCount[index] < prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + virtual int workCount() { return _workCount; } + + void setLastAtomAddr(Addr atom_addr) + { + lastCounterIndex = getIndexFromAtomAddr(atom_addr); + } + + // CAUTION: If this function returns an addr that + // is in the cache, that addr should be ignored. + // CAUTION: The receiver should track the last n + // addresses that this WorkDirectory has generated. + // where n is equal to the size of the entry holding + // reads generated by this WorkDirectory. In case + // the WorkDirectory generates a repeated address + // it should be ignored. + // FIXME: This should return garbage if it can't find anything. + // virtual Addr getNextWork() + // { + // if ((currentCounter == numAtomsPerBlock) || + // (popCount[currentIndex] == 0)) { + // int prev_index = currentIndex; + // while (true) { + // currentIndex++; + // // NOTE: this is an optimization. + // // lastCounterIndex tracks the last blockOfAtom that + // // has vertices. By default it is set to numCounters - 1. + // // However, it might not be necessary to track all the + // // numCounters counters. e.g. If this WorkDirectory is tracking + // // a 512 MiB memory with atom size of 32 B and 256 atoms + // // per block. Then it needs 64 Ki counters of 8 bit wide. + // // However, if we need 8 Mi atoms to store all our vertices, + // // the second half of the counters would not be used at all + // // (512 MiB hold 16 Mi atoms and we're only using half). 
+ // if (currentIndex > lastCounterIndex) { + // currentIndex = 0; + // } + // if (prev_index == currentIndex) { + // // NOTE: If we have reached the same index as before, + // // we need to decrement the currentCounter to generate + // // a repeatative address. This way the receiver can detect + // // the uselessness of the generated address and ignore it + // currentCounter--; + // break; + // } + // if (popCount[currentIndex] > 0) { + // currentCounter = 0; + // break; + // } + // } + // } + // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + // currentCounter++; + + // return ret_addr; + // } + + virtual Addr getNextWork() + { + if ((currentCounter == numAtomsPerBlock) || + (popCount[currentIndex] == 0)) { + int other_count = _workCount - popCount[currentIndex]; + if (other_count == 0) { + currentCounter = 0; + } else { + int prev_index = currentIndex; + while (true) { + currentIndex++; + if (currentIndex > lastCounterIndex) { + currentIndex = 0; + } + if (currentIndex == prev_index) { + break; + } + if (popCount[currentIndex] > 0) { + break; + } + } + currentCounter = 0; + } + } + Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + currentCounter++; + return ret_addr; + } +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index 9a3600f331..6344e7c228 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -211,7 +211,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; From c24c8f886796b7462e03731138ce84e129f3e6c3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 19:53:35 -0800 Subject: [PATCH 212/279] Adding new stats. 
--- configs/accl/sega.py | 12 ++++-- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/coalesce_engine.cc | 51 ++++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 16 ++++++-- src/accl/graph/sega/push_engine.hh | 5 ++- 6 files changed, 51 insertions(+), 39 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54f22b1377..7baa27fd5e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -53,7 +53,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=64, max_resp_per_cycle=8, active_buffer_size = 64, post_push_wb_queue_size=64, @@ -61,7 +60,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=512, + resp_queue_size=4096, update_queue_size=32, ) @@ -74,7 +73,11 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - + # self.edge_mem_ctrl = SimpleMemory(latency="90ns", + # latency_var="0ns", + # bandwidth="18GiB/s", + # range=AddrRange(edge_memory_size), + # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -105,6 +108,9 @@ def set_vertex_pch_bit(self, pch_bit): def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + # def set_edge_image(self, edge_image): + # self.edge_mem_ctrl.image_file = edge_image + class SEGA(System): diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a447dedc3d..76e7d262e8 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,8 +37,6 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") - 
num_mshr_entry = Param.Int("Number of MSHR entries.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") active_buffer_size = Param.Int("Maximum number of memory active memory " diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0aa61345f7..d7cf173097 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,7 +46,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), pullsReceived(0), pullsScheduled(0), pendingPullReads(0), activeBufferSize(params.active_buffer_size), @@ -227,7 +227,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " @@ -239,7 +238,6 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (cacheBlocks[block_index].state != CacheState::INVALID) { @@ -284,29 +282,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); - if (MSHR.size() < numMSHREntries) { - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].dirty = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].state = 
CacheState::PENDING_DATA; - cacheBlocks[block_index].lastChangedTick = curTick(); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); - MSHR[block_index].push_back(addr); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - return ReadReturnStatus::ACCEPT; - } else { - return ReadReturnStatus::REJECT_ROLL; + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } + return ReadReturnStatus::ACCEPT; } + stats.readMisses++; } } @@ -939,6 +934,8 @@ CoalesceEngine::processNextApplyEvent() owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pkt->deleteData(); @@ -986,6 +983,8 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[index].edgeIndex, cacheBlocks[block_index].items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } @@ -1057,8 +1056,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, 
statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), @@ -1082,7 +1079,7 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), + ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. (ns)"), @@ -1103,7 +1100,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - bitvectorLength.init(64); + frontierSize.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c457b214f9..f87e0027a2 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,6 @@ class CoalesceEngine : public BaseMemoryEngine Block* cacheBlocks; int onTheFlyReqs; - int numMSHREntries; std::unordered_map> MSHR; // Response route to WLEngine @@ -167,7 +166,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; @@ -180,7 +178,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram bitvectorLength; + statistics::Histogram frontierSize; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff 
--git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a17991e335..09f29a43e4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,7 +158,7 @@ PushEngine::start() // assert(!nextVertexPullEvent.scheduled()); _running = true; - stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. assert(workLeft()); if (vertexSpace() && !nextVertexPullEvent.scheduled()) { @@ -196,6 +196,7 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); numPendingPulls--; if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -239,6 +240,7 @@ PushEngine::processNextMemoryReadEvent() stats.edgePointerQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } @@ -282,6 +284,7 @@ PushEngine::handleMemResp(PacketPtr pkt) MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } stats.numWastefulEdgesRead += (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; @@ -320,6 +323,7 @@ PushEngine::processNextPropagateEvent() stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } @@ -466,8 +470,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), - ADD_STAT(numIdleCycles, statistics::units::Count::get(), - "Number of cycles PushEngine has been idle."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), "Number of coalescions in the update queues."), ADD_STAT(numUpdates, statistics::units::Count::get(), @@ -479,8 +483,12 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Traversed Edges Per Second."), ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), 
ADD_STAT(numPropagatesHist, statistics::units::Count::get(), @@ -496,7 +504,9 @@ PushEngine::PushStats::regStats() TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); edgeQueueLatency.init(64); + edgeQueueLength.init(64); updateQueueLength.init(64); numPropagatesHist.init(push.params().max_propagates_per_cycle); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 08cceb14f0..f51865acb3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -164,9 +164,10 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; + statistics::Scalar numMemoryBlocks; statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; - statistics::Scalar numIdleCycles; + // statistics::Scalar numIdleCycles; statistics::Scalar updateQueueCoalescions; statistics::Scalar numUpdates; statistics::Scalar numWastefulEdgesRead; @@ -174,7 +175,9 @@ class PushEngine : public BaseMemoryEngine statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; statistics::Histogram updateQueueLength; statistics::Histogram numPropagatesHist; }; From a86d7b1c4226e2aaca508346c03518093058e5fa Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 07:36:05 -0800 Subject: [PATCH 213/279] Adding state. 
--- configs/accl/bfs.py | 35 ++++++++++++++++++++------ configs/accl/sega.py | 6 +---- src/accl/graph/sega/PushEngine.py | 4 +-- src/accl/graph/sega/coalesce_engine.cc | 26 +++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 10 +++++--- 6 files changed, 57 insertions(+), 25 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index a201acd4d1..80331e3aad 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -47,6 +47,14 @@ def get_inputs(): default=False, help="Print final answer", ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample statistics", + ) args = argparser.parse_args() @@ -56,24 +64,37 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.sample, args.verify, ) if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() - system.create_pop_count_directory(256) + system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7baa27fd5e..29a017ba65 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,6 
+61,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=4096, + max_propagates_per_cycle=8, update_queue_size=32, ) @@ -73,11 +74,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - # self.edge_mem_ctrl = SimpleMemory(latency="90ns", - # latency_var="0ns", - # bandwidth="18GiB/s", - # range=AddrRange(edge_memory_size), - # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 20c5452d43..63fa1eae62 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -42,8 +42,8 @@ class PushEngine(BaseMemoryEngine): "push engine where it stores the " "edges read from memory.") - max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " - "done per cycle.") + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7cf173097..adb33064f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -239,7 +239,7 @@ CoalesceEngine::recvWLRead(Addr addr) // miss assert(cacheBlocks[block_index].addr != aligned_addr); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - + stats.readMisses++; if (cacheBlocks[block_index].state != CacheState::INVALID) { // conflict miss DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " @@ -268,7 +268,9 @@ CoalesceEngine::recvWLRead(Addr addr) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = 
directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } // NOTE: Bring the cache line to invalid state. // NOTE: Above line where we set hasConflict to true @@ -301,7 +303,6 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::ACCEPT; } - stats.readMisses++; } } @@ -376,8 +377,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeCacheBlocks.push_back(block_index); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } assert(MSHR.find(block_index) != MSHR.end()); @@ -433,8 +436,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active |= graphWorkload->activeCondition(items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" "activeBuffer.size: %d.\n", __func__, pkt->print(), activeBuffer.size()); @@ -591,7 +596,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } cacheBlocks[block_index].reset(); } @@ -804,7 +811,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } PacketPtr pkt = createWritePacket( @@ -1081,6 +1090,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Rate at which vertices are pushed."), ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(blockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1101,6 +1112,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; frontierSize.init(64); + blockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f87e0027a2..b855fda38b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -179,6 +179,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPushBW; statistics::Histogram frontierSize; + statistics::Histogram blockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 4102e29cd3..35778686c8 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -38,8 +38,8 @@ namespace gem5 class WorkDirectory { public: - virtual void activate(Addr atom_addr) = 0; - virtual void deactivate(Addr atom_addr) = 0; + virtual int activate(Addr atom_addr) = 0; + virtual int deactivate(Addr atom_addr) = 0; virtual Addr getNextWork() = 0; virtual int workCount() = 0; @@ -99,7 +99,7 @@ class PopCountDirectory: public WorkDirectory // CAUTION: This should only be called when the work // directory **is not** tracking the the atom with atom_addr - virtual void activate(Addr atom_addr) + virtual int activate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -107,11 +107,12 @@ class PopCountDirectory: public WorkDirectory _workCount++; assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } // CAUTION: This should only be called when the work // directory **is** tracking the the atom with atom_addr - virtual 
void deactivate(Addr atom_addr) + virtual int deactivate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -119,6 +120,7 @@ class PopCountDirectory: public WorkDirectory _workCount--; assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } virtual int workCount() { return _workCount; } From e38b1c045ce67ea4df986d88aa6fb4393d712756 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:00:00 -0800 Subject: [PATCH 214/279] Adding stat to count number of conflict misses. --- src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/coalesce_engine.hh | 1 + 2 files changed, 4 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index adb33064f7..8c636615cd 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -279,6 +279,7 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::REJECT_NO_ROLL; } else { + stats.numConflicts++; return ReadReturnStatus::REJECT_ROLL; } } else { @@ -1065,6 +1066,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. 
"), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b855fda38b..c2da6a90cd 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -166,6 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; From 614ba9237f57788c7b6dca1ac727c3b3eae7b622 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:17:20 -0800 Subject: [PATCH 215/279] Adding stat to count the number of update rolls. --- src/accl/graph/sega/coalesce_engine.cc | 3 --- src/accl/graph/sega/enums.cc | 3 +-- src/accl/graph/sega/enums.hh | 1 - src/accl/graph/sega/wl_engine.cc | 4 ++++ src/accl/graph/sega/wl_engine.hh | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8c636615cd..b9ac25c502 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -179,9 +179,6 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { - return ReadReturnStatus::REJECT_NO_ROLL; - } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 8c9d223178..de5d569c18 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -36,8 +36,7 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_DATA", "BUSY", "IDLE", - "PENDING_WB", - "LOCKED_FOR_APPLY" + "PENDING_WB" }; diff --git 
a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index e7a8f84452..6153386b71 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -39,7 +39,6 @@ enum CacheState BUSY, IDLE, PENDING_WB, - LOCKED_FOR_APPLY, NUM_CACHE_STATE }; extern const char* cacheStateStrings[NUM_CACHE_STATE]; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2b305e1557..ed91622b43 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -224,6 +224,7 @@ WLEngine::processNextReadEvent() update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); + stats.numUpdateRolls++; } else { DPRINTF(WLEngine, "%s: Received a reject from cache. " "Not rolling the update.\n", __func__); @@ -330,6 +331,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " "stalled because of register shortage"), + ADD_STAT(numUpdateRolls, statistics::units::Count::get(), + "Number of times an update has been rolled back " + "to the back of the update queue due to cache reject."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), ADD_STAT(updateQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b5ad3d9040..45baaa1e79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -101,6 +101,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Scalar numUpdateRolls; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From c38cab0dbd651b7e6f2cf8280b857ef437dee4bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 19:47:35 -0800 
Subject: [PATCH 216/279] Removing unnecessary comments. --- src/accl/graph/sega/coalesce_engine.cc | 52 +++----------------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b9ac25c502..98229dde24 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -86,6 +86,9 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) int block_index = getBlockIndex(addr); // FIXME: Check postPushWBQueue for hits + // Is it really the case though. I don't think at this time + // beacuse we check done after handleMemResp and make sure all + // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -438,23 +441,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) activeBuffer.emplace_back(pkt, curTick()); stats.blockActiveCount.sample(count); stats.frontierSize.sample(directory->workCount()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); } else { delete pkt; } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -685,9 +675,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; wb = postPushWBQueue.erase(wb); delete wb_pkt; - DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. 
" - "postPushWBQueue.size: %d.\n", __func__, - cacheBlocks[block_index].addr, postPushWBQueue.size()); } else { wb++; } @@ -707,16 +694,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; ab = activeBuffer.erase(ab); delete ab_pkt; - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // pullsScheduled++; - // } - DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - cacheBlocks[block_index].addr, activeBuffer.size()); if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -841,6 +818,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) if (postPushWBQueue.empty()) { return; } + PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); @@ -848,9 +826,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) memPort.sendPacket(wb_pkt); onTheFlyReqs++; postPushWBQueue.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); } } @@ -958,13 +933,7 @@ CoalesceEngine::processNextApplyEvent() PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); activeBuffer.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. 
" - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextPostPushWB(ignore, schedule_tick); @@ -1020,17 +989,6 @@ CoalesceEngine::processNextApplyEvent() "work to apply.\n", __func__); } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { From 1169c19446a5d781cd477e92ffe03fc1c3ac45e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 22:17:20 -0800 Subject: [PATCH 217/279] Removing comments. --- src/accl/graph/sega/work_directory.hh | 103 ++++++++------------------ 1 file changed, 30 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 35778686c8..18430aee0d 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ #define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#include + +#include "accl/graph/base/data_structs.hh" #include "base/addr_range.hh" #include "base/types.hh" @@ -63,9 +66,11 @@ class PopCountDirectory: public WorkDirectory int lastCounterIndex; uint32_t* popCount; - uint32_t currentIndex; + uint32_t prevIndex; uint32_t currentCounter; + UniqueFIFO activeBlockIndices; + int getIndexFromAtomAddr(Addr atom_addr) { assert((atom_addr % memoryAtomSize) == 0); @@ -86,7 +91,7 @@ class PopCountDirectory: public WorkDirectory WorkDirectory(), memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), memoryAtomSize(atom_size), _workCount(0), - 
currentIndex(0), currentCounter(0) + prevIndex(-1), currentCounter(0) { blockSize = numAtomsPerBlock * memoryAtomSize; int numCounters = (int) (memoryRange.size() / blockSize); @@ -105,6 +110,7 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]++; _workCount++; + activeBlockIndices.push_back(index); assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -118,6 +124,9 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]--; _workCount--; + if (popCount[index] == 0) { + activeBlockIndices.erase(index); + } assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -130,80 +139,28 @@ class PopCountDirectory: public WorkDirectory lastCounterIndex = getIndexFromAtomAddr(atom_addr); } - // CAUTION: If this function returns an addr that - // is in the cache, that addr should be ignored. - // CAUTION: The receiver should track the last n - // addresses that this WorkDirectory has generated. - // where n is equal to the size of the entry holding - // reads generated by this WorkDirectory. In case - // the WorkDirectory generates a repeated address - // it should be ignored. - // FIXME: This should return garbage if it can't find anything. - // virtual Addr getNextWork() - // { - // if ((currentCounter == numAtomsPerBlock) || - // (popCount[currentIndex] == 0)) { - // int prev_index = currentIndex; - // while (true) { - // currentIndex++; - // // NOTE: this is an optimization. - // // lastCounterIndex tracks the last blockOfAtom that - // // has vertices. By default it is set to numCounters - 1. - // // However, it might not be necessary to track all the - // // numCounters counters. e.g. If this WorkDirectory is tracking - // // a 512 MiB memory with atom size of 32 B and 256 atoms - // // per block. Then it needs 64 Ki counters of 8 bit wide. 
- // // However, if we need 8 Mi atoms to store all our vertices, - // // the second half of the counters would not be used at all - // // (512 MiB hold 16 Mi atoms and we're only using half). - // if (currentIndex > lastCounterIndex) { - // currentIndex = 0; - // } - // if (prev_index == currentIndex) { - // // NOTE: If we have reached the same index as before, - // // we need to decrement the currentCounter to generate - // // a repeatative address. This way the receiver can detect - // // the uselessness of the generated address and ignore it - // currentCounter--; - // break; - // } - // if (popCount[currentIndex] > 0) { - // currentCounter = 0; - // break; - // } - // } - // } - // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); - // currentCounter++; - - // return ret_addr; - // } - + // CAUTION: This directory only tracks active vertices in the memory + // and it does not have any information on the state of the cache and/or + // the active buffer or the write buffer. Therefore, it might generate a + // read request to an address that might be in any of those. In that case, + // the generated address should be ignored. virtual Addr getNextWork() { - if ((currentCounter == numAtomsPerBlock) || - (popCount[currentIndex] == 0)) { - int other_count = _workCount - popCount[currentIndex]; - if (other_count == 0) { - currentCounter = 0; - } else { - int prev_index = currentIndex; - while (true) { - currentIndex++; - if (currentIndex > lastCounterIndex) { - currentIndex = 0; - } - if (currentIndex == prev_index) { - break; - } - if (popCount[currentIndex] > 0) { - break; - } - } - currentCounter = 0; - } + // Why ask directory if it's empty? 
+ assert(!activeBlockIndices.empty()); + int front_index = activeBlockIndices.front(); + assert(popCount[front_index] > 0); + if ((prevIndex != -1) && (prevIndex != front_index)) { + currentCounter = 0; + } + if (currentCounter == numAtomsPerBlock) { + currentCounter = 0; + activeBlockIndices.pop_front(); + activeBlockIndices.push_back(front_index); } - Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + int current_index = activeBlockIndices.front(); + Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter); + prevIndex = current_index; currentCounter++; return ret_addr; } From a98dd0f9586ca3f0d7837ad9f9e1525d4816e106 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 09:05:29 -0800 Subject: [PATCH 218/279] Adding pr and updating config scripts. --- configs/accl/bfs.py | 24 ++-- configs/accl/pr-sample.py | 109 -------------- configs/accl/pr.py | 44 +++++- configs/accl/sega.py | 36 +++-- src/accl/graph/base/graph_workload.cc | 157 +++++++++------------ src/accl/graph/base/graph_workload.hh | 38 ++--- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 +- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 53 ++++--- src/accl/graph/sega/coalesce_engine.hh | 5 +- 12 files changed, 201 insertions(+), 280 deletions(-) delete mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 80331e3aad..829449c599 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -40,20 +40,20 @@ def get_inputs(): argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( - "--verify", - dest="verify", + "--sample", + dest="sample", action="store_const", const=True, default=False, - help="Print final answer", + help="Sample sim stats every 100us", ) argparser.add_argument( - "--sample", - dest="sample", + 
"--verify", + dest="verify", action="store_const", const=True, default=False, - help="Sample statistics", + help="Print final answer", ) args = argparser.parse_args() @@ -70,7 +70,15 @@ def get_inputs(): if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -81,7 +89,7 @@ def get_inputs(): system.create_bfs_workload(init_addr, init_value) if sample: while True: - exit_event = m5.simulate(10000000) + exit_event = m5.simulate(100000000) print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py deleted file mode 100644 index ac3616dc84..0000000000 --- a/configs/accl/pr-sample.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from sega import SEGA - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 10us", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.cache_size, - args.graph, - args.alpha, - args.threshold, - args.verify, - args.sample, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - cache_size, - graph, - alpha, - threshold, - verify, - sample, - ) = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.create_pr_workload(alpha, threshold) - - if sample: - while True: - exit_event = m5.simulate(10000000) 
- print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - m5.stats.dump() - m5.stats.reset() - print(exit_event.getCause()) - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 59e8b924c6..e852e47561 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) argparser.add_argument( "--verify", dest="verify", @@ -56,23 +64,45 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.sample, args.verify, ) - if __name__ == "__m5_main__": - num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() + system.create_pop_count_directory(64) system.create_pr_workload(alpha, threshold) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: 
system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 29a017ba65..7831302228 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,14 +47,18 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str, simple_mem: bool = False + ): super().__init__() + self._simple_mem = simple_mem self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - active_buffer_size = 64, + pending_pull_limit=32, + active_buffer_size=64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -65,9 +69,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): update_queue_size=32, ) - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() - ) + if self._simple_mem: + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + ) + else: + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( @@ -96,18 +106,20 @@ def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + if self._simple_mem: + self.vertex_mem_ctrl.range = vertex_ranges[0] + else: + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit + if self._simple_mem: + pass + else: + self.vertex_mem_ctrl.pch_bit = pch_bit def 
set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image - # def set_edge_image(self, edge_image): - # self.edge_mem_ctrl.image_file = edge_image - - class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 446509201f..0539296cce 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,92 +113,75 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): -// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -// { -// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -// } - -// void -// PRWorkload::init(PacketPtr pkt, int bit_index_base, -// std::bitset& needsPush, -// std::deque& activeBits, -// int& _workCount) -// { -// WorkListItem items[numElementsPerLine]; - -// pkt->writeDataToBlock((uint8_t*) items, atomSize); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// if (items[i].degree > 0) { -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// _workCount++; -// } -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, atomSize); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = 1.0; - -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::applyCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float 
prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return dist >= threshold; -// } - -// bool -// PRWorkload::preWBApply(WorkListItem& wl) -// { -// if (applyCondition(wl) && (wl.degree > 0)) { -// return true; -// } -// return false; -// } - -// std::tuple -// PRWorkload::apply(WorkListItem& wl) -// { -// if (applyCondition(wl)) { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return std::make_tuple(delta_uint, true, true); -// } -// return std::make_tuple(0, false, false); -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, temp_float, wl.degree, wl.edgeIndex -// ); -// } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { 
+ weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + wl.prop = wl.tempProp; + return delta_uint; +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", + temp_float, prop_float, wl.degree, wl.edgeIndex); +} } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f71955bd16..f335ad9b47 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -77,24 +77,26 @@ class BFSWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; - -// public: -// PRWorkload(float alpha, float threshold); - -// ~PRWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr 
pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0c21833a05..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 76e7d262e8..c2393c2f1e 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -39,6 +39,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") active_buffer_size = Param.Int("Maximum number of memory active memory " "atoms ready to send updates. 
This parameter " "and post_push_wb_queue_size should be set " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 883992e64e..60c78559e4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -110,11 +110,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); -// } +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} void CenteralController::recvDoneSignal() diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 6eb07dbcac..ae2980d050 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -63,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 98229dde24..8ac40198be 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), - maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), pendingPullReads(0), - activeBufferSize(params.active_buffer_size), + onTheFlyReqs(0), 
maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -129,29 +129,17 @@ CoalesceEngine::done() } bool -CoalesceEngine::timeToPull() +CoalesceEngine::enoughSpace() { - return (activeBuffer.size() + pendingPullReads) < activeBufferSize; -} - -bool -CoalesceEngine::canSchedulePull() -{ - // TODO: Maybe a good idea to change this to - // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize - return pullsScheduled < 1; -} - -bool -CoalesceEngine::workLeftInMem() -{ - return !directory->empty(); + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; } bool CoalesceEngine::pullCondition() { - return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; } // addr should be aligned to peerMemoryAtomSize @@ -784,12 +772,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) atom_active |= graphWorkload->activeCondition( cacheBlocks[block_index].items[index]); } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); - } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, @@ -797,8 +779,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; + if (atom_active) { + activeCacheBlocks.erase(block_index); + if 
(enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + memPort.sendPacket(pkt); + onTheFlyReqs++; + } cacheBlocks[block_index].reset(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c2da6a90cd..f605704b6d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO activeCacheBlocks; int pullsScheduled; + int pendingPullLimit; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. @@ -128,9 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> activeBuffer; std::deque> postPushWBQueue; - bool timeToPull(); - bool canSchedulePull(); - bool workLeftInMem(); + bool enoughSpace(); bool pullCondition(); int getBlockIndex(Addr addr); From b3d678a97b046ed47835e4be83e7965cd92a3566 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 21:24:39 -0800 Subject: [PATCH 219/279] Updating activeCondition for PR. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 0539296cce..05c8d05089 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -160,7 +160,7 @@ PRWorkload::activeCondition(WorkListItem wl) float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float dist = std::abs(temp_float - prop_float); - return dist >= threshold; + return (dist >= threshold) && (wl.degree > 0); } uint32_t From e3ef860a82550c90b7c1ca96b1fbbd534c90b295 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Nov 2022 15:36:40 -0800 Subject: [PATCH 220/279] Adding SSSP and CC --- src/accl/graph/base/graph_workload.cc | 172 ++++++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 58 +++++++++ 2 files changed, 230 insertions(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 05c8d05089..e36c074da9 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,6 +113,121 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } +void +BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + 
+uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +{ + return 1; +} + +bool +BFSVisitedWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +BFSVisitedWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + +void +SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +SSSPWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +bool +SSSPWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +SSSPWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +SSSPWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + + void PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -184,4 +299,61 @@ PRWorkload::printWorkListItem(const WorkListItem wl) temp_float, prop_float, wl.degree, 
wl.edgeIndex); } +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; + items[i].prop = -1; + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +CCWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +CCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +bool +CCWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +CCWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +CCWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f335ad9b47..de2877d6e8 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -76,6 +76,48 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class BFSVisitedWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSVisitedWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class SSSPWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + SSSPWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~SSSPWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class PRWorkload : public GraphWorkload { @@ -98,6 +140,22 @@ class PRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class CCWorkload : public GraphWorkload +{ + + public: + CCWorkload() {} + + ~CCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + } #endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ From c56b386a4645e4ff325827e087ca78e65ea59ab6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 11 Nov 2022 14:40:50 -0800 Subject: [PATCH 221/279] Adding option to use SimpleMemory for vertex memory. 
--- configs/accl/bfs.py | 17 ++- configs/accl/pr.py | 20 ++- configs/accl/real-graph-gen.py | 16 ++- configs/accl/sega.py | 34 ++--- .../accl/{sega-simple.py => sega_simple.py} | 133 ++++++++---------- 5 files changed, 113 insertions(+), 107 deletions(-) rename configs/accl/{sega-simple.py => sega_simple.py} (50%) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 829449c599..806aa8a915 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,6 +71,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.simple, args.sample, args.verify, ) @@ -76,10 +84,15 @@ def get_inputs(): graph, init_addr, init_value, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e852e47561..e3d7c764ad 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,10 +71,12 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.simple, args.sample, args.verify, ) + if __name__ == "__m5_main__": ( num_gpts, @@ -75,10 +84,15 @@ def get_inputs(): graph, alpha, threshold, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -95,7 +109,6 @@ def get_inputs(): ) m5.stats.dump() m5.stats.reset() - print(exit_event.getCause()) if exit_event.getCause() != "simulate() limit reached": break else: @@ -106,3 +119,4 @@ def get_inputs(): ) if verify: system.print_answer() + diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index b943a925c1..332bb67452 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -45,8 +45,11 @@ def get_inputs(): if __name__ == "__main__": graph_path, num_gpts = get_inputs() + graph_sorter = os.environ.get("GRAPH_SORTER") graph_reader = os.environ.get("GRAPH_READER") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER.") if graph_reader is None: raise ValueError(f"No value for $GRAPH_READER.") @@ -54,6 +57,17 @@ def get_inputs(): raise ValueError(f"{graph_path} does not exist.") graph_dir = os.path.dirname(graph_path) + sorted_graph = f"{graph_dir}/sorted_graph.txt" + if not os.path.exists(sorted_graph): + print(f"Sorting {graph_path} into {sorted_graph}.") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}", + 
f"{sorted_graph}", + ] + ) if not "binaries" in os.listdir(graph_dir): print(f"binaries directory not found in {graph_dir}") os.mkdir(f"{graph_dir}/binaries") @@ -80,7 +94,7 @@ def get_inputs(): subprocess.run( [ f"{graph_reader}", - f"{graph_path}", + f"{sorted_graph}", "false", f"{num_gpts}", "32", diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7831302228..1ea36ea49e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,11 +48,9 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__( - self, edge_memory_size: str, cache_size: str, simple_mem: bool = False - ): + self, edge_memory_size: str, cache_size: str): super().__init__() - self._simple_mem = simple_mem - self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, @@ -69,20 +67,14 @@ def __init__( update_queue_size=32, ) - if self._simple_mem: - self.vertex_mem_ctrl = SimpleMemory( - latency="122ns", latency_var="0ns", bandwidth="28GiB/s" - ) - else: - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False - ) + range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -106,17 +98,11 @@ def setReqPort(self, port): self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_ranges): - if self._simple_mem: - self.vertex_mem_ctrl.range = vertex_ranges[0] - else: - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - if self._simple_mem: - pass - else: - self.vertex_mem_ctrl.pch_bit = pch_bit + self.vertex_mem_ctrl.pch_bit = pch_bit def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image diff --git a/configs/accl/sega-simple.py b/configs/accl/sega_simple.py similarity index 50% rename from configs/accl/sega-simple.py rename to configs/accl/sega_simple.py index 7ec19c92ae..f59fa71a79 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega_simple.py @@ -24,90 +24,87 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) return ret + class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="0ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") + + self.edge_mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" @@ -115,14 +112,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("4GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) @@ -134,32 +129,16 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - 
argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") From 6e749d3ed3cbf72378a896507c4256b09616f19c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:55:54 -0800 Subject: [PATCH 222/279] Removing graph gen scripts and moved to sega-utils. --- configs/accl/real-graph-gen.py | 107 ------------------------ configs/accl/synth-graph-gen.py | 139 -------------------------------- 2 files changed, 246 deletions(-) delete mode 100644 configs/accl/real-graph-gen.py delete mode 100644 configs/accl/synth-graph-gen.py diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py deleted file mode 100644 index 332bb67452..0000000000 --- a/configs/accl/real-graph-gen.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.path, args.num_gpts - - -if __name__ == "__main__": - graph_path, num_gpts = get_inputs() - - graph_sorter = os.environ.get("GRAPH_SORTER") - graph_reader = os.environ.get("GRAPH_READER") - - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - - if not os.path.exists(graph_path): - raise ValueError(f"{graph_path} does not exist.") - - graph_dir = os.path.dirname(graph_path) - sorted_graph = f"{graph_dir}/sorted_graph.txt" - if not os.path.exists(sorted_graph): - print(f"Sorting {graph_path} into {sorted_graph}.") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}", - f"{sorted_graph}", - ] - ) - if not "binaries" in os.listdir(graph_dir): - print(f"binaries directory not found in {graph_dir}") - os.mkdir(f"{graph_dir}/binaries") - print(f"Created {graph_dir}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") - os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run( - [ - f"{graph_reader}", - 
f"{sorted_graph}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}" - ) diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py deleted file mode 100644 index 15e4a6eff2..0000000000 --- a/configs/accl/synth-graph-gen.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument( - "scale", type=int, help="The scale of the synth graph to generate." - ) - argparser.add_argument( - "deg", - type=int, - help="The average degree of the synth graph to generate.", - ) - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.scale, args.deg, args.num_gpts - - -if __name__ == "__main__": - scale, deg, num_gpts = get_inputs() - - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - for delete in os.scandir(graph_path): - os.remove(delete.path) - print(f"Deleted everything in {graph_path}") - subprocess.run( - [ - f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt", - ] - ) - print(f"Generated a graph with scale " f"{scale} and deg {deg}") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt", - ] - ) - print( - f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt" - ) - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in 
os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print( - f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" - ) - subprocess.run( - [ - f"{graph_reader}", - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}" - ) From 51b12cd7a835edb8743353e89367afd3c4b17dfe Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:57:19 -0800 Subject: [PATCH 223/279] Adding BSP mode. 
--- src/accl/graph/base/data_structs.hh | 30 ++- src/accl/graph/base/graph_workload.hh | 2 +- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 66 ++++-- src/accl/graph/sega/centeral_controller.hh | 10 +- src/accl/graph/sega/coalesce_engine.cc | 257 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 17 +- src/accl/graph/sega/enums.cc | 15 +- src/accl/graph/sega/enums.hh | 18 ++ src/accl/graph/sega/mpu.hh | 4 + 11 files changed, 308 insertions(+), 117 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 84233ae39c..f09a0dd167 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -43,28 +43,34 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; std::string to_string() { return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u}", tempProp, prop, edgeIndex, degree); + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + activeNow(false), + activeFuture(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): - tempProp(temp_prop), - prop(prop), - degree(degree), - edgeIndex(edge_index) + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) {} }; @@ -158,6 +164,10 @@ class UniqueFIFO return fifo.size(); } + void clear() { + fifo.clear(); + } + bool empty() { return fifo.empty(); } @@ -174,6 +184,10 @@ class UniqueFIFO assert(it != fifo.end()); fifo.erase(it); } + + void operator=(const UniqueFIFO& rhs) { + fifo = rhs.fifo; + } }; } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index de2877d6e8..14a6561ae3 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,7 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; - virtual bool activeCondition(WorkListItem wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..8b43c90102 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,6 +42,9 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), PyBindMethod("createPRWorkload"), 
PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index c2393c2f1e..25f8a1c58b 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,7 +27,6 @@ from m5.params import * from m5.proxy import * -from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -48,5 +47,3 @@ class CoalesceEngine(BaseMemoryEngine): "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - - cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 60c78559e4..6c924a4703 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -42,7 +42,9 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system) + system(params.system), + mode(ProcessingMode::NOT_SET), + state(BulkSynchronousState::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,11 +52,41 @@ CenteralController::CenteralController(const Params& params): } } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + +void +CenteralController::createPopCountDirectory(int atoms_per_block) +{ + fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " + "mode by calling either setAsyncMode or setBSPMode.") + if (mode == ProcessingMode::ASYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } + if (mode == 
ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createBSPPopCountDirectory(atoms_per_block); + } + } +} + void CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->setProcessingMode(mode); mpu->recvWorkload(workload); } @@ -83,7 +115,7 @@ CenteralController::startup() for (auto mpu: mpuVector) { mpu->postMemInitSetup(); - if (!mpu->running() && (mpu->workCount()> 0)) { + if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } @@ -104,18 +136,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -void -CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) -{ - workload = new BFSWorkload(init_addr, init_value); -} - -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} - void CenteralController::recvDoneSignal() { @@ -124,9 +144,25 @@ CenteralController::recvDoneSignal() done &= mpu->done(); } - if (done) { + if (done && mode == ProcessingMode::ASYNCHRONOUS) { exitSimLoopNow("no update left to process."); } + + if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { + assert(state != BulkSynchronousState::DONT_CARE); + if (state == BulkSynchronousState::APPLYING) { + // TODO: + // 1- Toggle directories + // 2- Check if termination condition is met + // 3- If yes, schedule exit event, + // 4- If not switch state to consuming. 
+ exitSimLoopNow("applying done."); + } else if (state == BulkSynchronousState::CONSUMING) { + // TODO: + // Schedule Bulk apply + exitSimLoopNow("consuming done."); + } + } } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ae2980d050..ab0e0c0c09 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "params/CenteralController.hh" @@ -46,9 +47,11 @@ class CenteralController : public ClockedObject { private: System* system; - Addr maxVertexAddr; + ProcessingMode mode; + BulkSynchronousState state; + std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -62,6 +65,11 @@ class CenteralController : public ClockedObject CenteralController(const CenteralControllerParams ¶ms); virtual void startup() override; + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } + void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + + void createPopCountDirectory(int atoms_per_block); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createPRWorkload(float alpha, float threshold); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8ac40198be..bfe3fe21b8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,7 +34,6 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -43,7 +42,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), lastAtomAddr(0), + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), 
lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), @@ -77,6 +76,8 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } + +// NOTE: Used for initializing memory and reading the final answer void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -85,10 +86,6 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // FIXME: Check postPushWBQueue for hits - // Is it really the case though. I don't think at this time - // beacuse we check done after handleMemResp and make sure all - // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -100,7 +97,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - graphWorkload->init(pkt, directory); + graphWorkload->init(pkt, currentDirectory); if (pkt->getAddr() > lastAtomAddr) { lastAtomAddr = pkt->getAddr(); } @@ -111,21 +108,46 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) void CoalesceEngine::postMemInitSetup() { - directory->setLastAtomAddr(lastAtomAddr); + currentDirectory->setLastAtomAddr(lastAtomAddr); } void -CoalesceEngine::createPopCountDirectory(int atoms_per_block) +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { - directory = new PopCountDirectory( + currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectroy( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void 
+CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); } bool CoalesceEngine::done() { - return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && - activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); } bool @@ -249,16 +271,21 @@ CoalesceEngine::recvWLRead(Addr addr) // NOTE: The cache block could still be active but // not dirty. If active we only have to active tracking // but can throw the data away. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -360,16 +387,21 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // Since it is going to the cache, cache will be responsible for // tracking this. Push to activeCacheBlocks for simulator speed // instead of having to search for active blocks in the cache. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); } - if (atom_active) { - int count = directory->deactivate(addr); - activeCacheBlocks.push_back(block_index); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_future) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); } assert(MSHR.find(block_index) != MSHR.end()); @@ -420,15 +452,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(items[index]); + atom_active |= items[index].activeNow; } - if (atom_active) { - int count = directory->deactivate(addr); + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); - 
stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); } else { delete pkt; } @@ -486,6 +519,9 @@ CoalesceEngine::processNextResponseEvent() stats.responseQueueLatency.sample( waiting_ticks * 1e9 / getClockFrequency()); if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. if (!responseQueue.empty()) { stats.responsePortShortage++; } @@ -533,12 +569,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { cacheBlocks[block_index].dirty |= true; } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; - if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && - (!activeCacheBlocks.find(block_index))) { - activeCacheBlocks.push_back(block_index); - if (!owner->running()) { - owner->start(); + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); } } @@ -565,16 +611,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) schedule(nextMemoryEvent, nextCycle()); } } else { - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - 
cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + // TODO: Sample frontier size and blockCount here. + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } cacheBlocks[block_index].reset(); } @@ -586,6 +638,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } @@ -623,6 +676,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } + // FIXME: done() might have a different meaning depending on + // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -659,6 +714,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; wb = postPushWBQueue.erase(wb); @@ -677,7 +742,19 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); - activeCacheBlocks.push_back(block_index); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; ab = activeBuffer.erase(ab); @@ -767,10 +844,11 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) // NOTE: If the atom we're writing back is active, we have to // stop tracking it in the cache and start tracking it in the memory. 
- bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } PacketPtr pkt = createWritePacket( @@ -779,18 +857,25 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - if (atom_active) { - activeCacheBlocks.erase(block_index); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); if (enoughSpace()) { activeBuffer.emplace_back(pkt, curTick()); } else { - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + } memPort.sendPacket(pkt); onTheFlyReqs++; } @@ -810,17 +895,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { - if (postPushWBQueue.empty()) { - return; - } - - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - onTheFlyReqs++; - postPushWBQueue.pop_front(); + if (!postPushWBQueue.empty()) 
{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } } } @@ -828,8 +920,8 @@ void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { pullsScheduled--; - if (!directory->empty()) { - Addr addr = directory->getNextWork(); + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); int block_index = getBlockIndex(addr); bool in_cache = cacheBlocks[block_index].addr == addr; @@ -875,8 +967,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + - directory->workCount() + activeBuffer.size(); + return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); } void @@ -905,9 +996,10 @@ CoalesceEngine::processNextApplyEvent() pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(items[index])) { + if (items[index].activeNow) { Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; @@ -919,12 +1011,12 @@ CoalesceEngine::processNextApplyEvent() pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
graphWorkload->activeCondition(items[index]); + atom_active_now |= items[index].activeNow; } // NOTE: If the atom is not active anymore. - if (!atom_active) { + if (!atom_active_now) { PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); @@ -946,9 +1038,10 @@ CoalesceEngine::processNextApplyEvent() int block_index = activeCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + if (cacheBlocks[block_index].items[index].activeNow) { Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; cacheBlocks[block_index].dirty = true; owner->recvVertexPush(addr, delta, cacheBlocks[block_index].items[index].edgeIndex, @@ -959,20 +1052,20 @@ CoalesceEngine::processNextApplyEvent() } } - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; } // NOTE: If we have reached the last item in the cache block - if (!atom_active) { - activeCacheBlocks.erase(block_index); + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); } break; } // NOTE: If the block with index at the front of activeCacheBlocks // is not in IDLE state, then roll the that index to the back - activeCacheBlocks.pop_front(); - activeCacheBlocks.push_back(block_index); + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); // NOTE: If we have visited all the items initially in the FIFO. 
num_visited_indices++; if (num_visited_indices == initial_fifo_length) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f605704b6d..39f2491232 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,7 +96,9 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; - WorkDirectory* directory; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; GraphWorkload* graphWorkload; Addr lastAtomAddr; @@ -114,8 +116,9 @@ class CoalesceEngine : public BaseMemoryEngine // Tracking work in cache int pullsReceived; - // NOTE: Remember to erase from this upon eviction from cache - UniqueFIFO activeCacheBlocks; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO currentActiveCacheBlocks; + UniqueFIFO futureActiveCacheBlocks; int pullsScheduled; int pendingPullLimit; @@ -195,12 +198,14 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); - - void createPopCountDirectory(int atoms_per_block); + void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index de5d569c18..83f3033427 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -39,7 +39,6 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_WB" }; - const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = { "ACCEPT", @@ -53,4 +52,18 @@ const char* 
readDestinationStrings[NUM_READ_DESTINATION] = "READ_FOR_PUSH" }; +const char* processingModeStrings[NUM_PROCESSING_MODE] = +{ + "NOT_SET", + "ASYNCHRONOUS", + "BULK_SYNCHRONOUS" +}; + +const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = +{ + "NOT_SET", + "CONSUMING", + "APPLYING" +}; + } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 6153386b71..f6d199bf7d 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -60,6 +60,24 @@ enum ReadDestination }; extern const char* readDestinationStrings[NUM_READ_DESTINATION]; +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +enum BulkSynchronousStates +{ + NOT_SET, + CONSUMING, + APPLYING, + NUM_BULK_SYNCHRONOUS_STATE, +} +extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; + } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ad18a0d5a5..358394ffc5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -63,6 +63,10 @@ class MPU : public SimObject MPU(const Params& params); void registerCenteralController(CenteralController* centeral_controller); + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } From 7ce7ef9fb23c9d1b4584dae63ec7db8c7ffccd8b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 
13:12:57 -0800 Subject: [PATCH 224/279] Fixing enums --- src/accl/graph/sega/centeral_controller.cc | 5 ++++- src/accl/graph/sega/enums.hh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6c924a4703..6e5f3ffcec 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -113,6 +113,9 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + state = BulkSynchronousStates::CONSUMING; + } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -149,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); + assert(state != BulkSynchronousState::NOT_SET); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index f6d199bf7d..8280f122c3 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -75,7 +75,7 @@ enum BulkSynchronousStates CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, -} +}; extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; } // namespace gem5 From 27ee07a4fed6ade8c3114faac85171c9afcc52eb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:15:52 -0800 Subject: [PATCH 225/279] Further fixes for enums. 
--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/enums.cc | 2 +- src/accl/graph/sega/enums.hh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6e5f3ffcec..c6b9cf7a52 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -44,7 +44,7 @@ CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::NOT_SET) + state(BulkSynchronousState::DONT_CARE) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -152,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::NOT_SET); + assert(state != BulkSynchronousState::DONT_CARE); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 83f3033427..099594e9eb 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -61,7 +61,7 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = { - "NOT_SET", + "DONT_CARE", "CONSUMING", "APPLYING" }; diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 8280f122c3..4c94412c9b 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -71,7 +71,7 @@ extern const char* processingModeStrings[NUM_PROCESSING_MODE]; enum BulkSynchronousStates { - NOT_SET, + DONT_CARE, CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, From 97ff4738149ddcb9b8b2634897b06ee8f35f6caf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:21:12 -0800 Subject: [PATCH 226/279] Fixing typos --- src/accl/graph/sega/enums.hh | 2 +- src/accl/graph/sega/mpu.hh | 4 ++-- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 4c94412c9b..969ee8a976 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,7 +69,7 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousStates +enum BulkSynchronousState { DONT_CARE, CONSUMING, diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 358394ffc5..7d75e3e0b7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -64,8 +64,8 @@ class MPU : public SimObject void registerCenteralController(CenteralController* centeral_controller); void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } - void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } - void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } From 54a0df243214cb9a711d579ab4d6ef18151ffcc3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:31:55 -0800 Subject: [PATCH 227/279] Fixing typos. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6b9cf7a52..df1abbedc3 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -114,7 +114,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousStates::CONSUMING; + state = BulkSynchronousState::CONSUMING; } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index bfe3fe21b8..6efafbb76c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -124,7 +124,7 @@ CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) { currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); - futureDirectory = new PopCountDirectroy( + futureDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); } @@ -390,7 +390,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool atom_active_now = false; bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { @@ -453,12 +453,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
items[index].activeNow; + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; } if (atom_active_now) { // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + if (atom_active_future) { + int count_2 = futureDirectory->deactivate(addr); + } activeBuffer.emplace_back(pkt, curTick()); // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); @@ -573,7 +578,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; if (mode == ProcessingMode::ASYNCHRONOUS) { - cacheBlocks[block_index].activeNow |= active; + cacheBlocks[block_index].items[wl_offset].activeNow |= active; if (active && (!currentActiveCacheBlocks.find(block_index))) { currentActiveCacheBlocks.push_back(block_index); if (!owner->running()) { @@ -582,7 +587,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - cacheBlocks[block_index].activeFuture |= active; + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; if (active && (!futureActiveCacheBlocks.find(block_index))) { futureActiveCacheBlocks.push_back(block_index); } @@ -903,7 +908,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) WorkListItem items[numElementsPerLine]; wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_future = false; - for (int index = 0; index < numElementPerLine; index++) { + for (int index = 0; index < numElementsPerLine; index++) { atom_active_future |= items[index].activeFuture; } if (atom_active_future) { @@ -967,7 +972,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); + return currentActiveCacheBlocks.size() + 
currentDirectory->workCount() + activeBuffer.size(); } void @@ -1031,7 +1036,7 @@ CoalesceEngine::processNextApplyEvent() } delete pkt; } - } else if (!activeCacheBlocks.empty()) { + } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; int initial_fifo_length = activeCacheBlocks.size(); while (true) { From 65e203dd4923781c5989e66674be66ce47c87ef3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:41:42 -0800 Subject: [PATCH 228/279] Fixing typos. --- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index df1abbedc3..db0f7941ed 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -68,7 +68,7 @@ void CenteralController::createPopCountDirectory(int atoms_per_block) { fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " - "mode by calling either setAsyncMode or setBSPMode.") + "mode by calling either setAsyncMode or setBSPMode."); if (mode == ProcessingMode::ASYNCHRONOUS) { for (auto mpu: mpuVector) { mpu->createAsyncPopCountDirectory(atoms_per_block); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6efafbb76c..e3c194566a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,9 +1038,9 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = activeCacheBlocks.size(); + int initial_fifo_length = crrentActiveCacheBlocks.size(); while (true) { - int block_index = activeCacheBlocks.front(); + int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 
0); index++) { if (cacheBlocks[block_index].items[index].activeNow) { From 23816803e8e289bb2cc08f82df0b43efcc83d529 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:54:47 -0800 Subject: [PATCH 229/279] Debug. --- src/accl/graph/base/graph_workload.cc | 74 +++++++++++++++++++++- src/accl/graph/base/graph_workload.hh | 36 +++++------ src/accl/graph/sega/centeral_controller.cc | 10 +-- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- 5 files changed, 97 insertions(+), 27 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e36c074da9..a78b3c1526 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -92,9 +92,9 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::activeCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (wl.tempProp < wl.prop) && (wl.degree > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); } uint32_t @@ -298,6 +298,76 @@ PRWorkload::printWorkListItem(const WorkListItem wl) "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, prop_float, wl.degree, wl.edgeIndex); } +// void +// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); +// bool atom_active = false; +// for (int i = 0; i < num_elements; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// atom_active |= activeCondition(items[i]); +// } +// if (atom_active) { +// dir->activate(pkt->getAddr()); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t 
update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = writeToFloat(weight); +// if (weight == 0) { +// weight_float = 1.0; +// } +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::activeCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return (dist >= threshold) && (wl.degree > 0); +// } + +// uint32_t +// PRWorkload::apply(WorkListItem& wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return delta_uint; +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, prop_float, wl.degree, wl.edgeIndex); +// } void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 14a6561ae3..8e27d16bf9 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -72,7 +72,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem 
old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +119,26 @@ class SSSPWorkload : public GraphWorkload }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - public: - PRWorkload(float alpha, float threshold): - alpha(alpha), threshold(threshold) - {} +// public: +// PRWorkload(float alpha, float threshold): +// alpha(alpha), threshold(threshold) +// {} - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index db0f7941ed..7de6f61b56 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -58,11 +58,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold); +// } void 
CenteralController::createPopCountDirectory(int atoms_per_block) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab0e0c0c09..b32dc38385 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -71,7 +71,7 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e3c194566a..6b44f7395b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,7 +1038,7 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = crrentActiveCacheBlocks.size(); + int initial_fifo_length = currentActiveCacheBlocks.size(); while (true) { int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { From a2f76f6842c958124218f33c410bf0990c68296b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:06:44 -0800 Subject: [PATCH 230/279] Debugging. 
--- src/accl/graph/base/graph_workload.cc | 8 +++++--- src/accl/graph/sega/CenteralController.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index a78b3c1526..50024965a1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -67,12 +67,14 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem items[num_elements]; pkt->writeDataToBlock((uint8_t*) items, pkt_size); - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { dir->activate(aligned_addr); } + items[index] = new_wl; + pkt->deleteData(); pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, pkt_size); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 8b43c90102..6de9e03a1c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,6 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] From 991d2dbe91f414f250f16ff6b13d159b42cfad88 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:08:17 -0800 Subject: [PATCH 231/279] Typos. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 50024965a1..9c21a3932a 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -96,7 +96,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) bool BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); } uint32_t From 90641ab20e71b70436d3f4c9315ab02c8d5ae2e2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:14:41 -0800 Subject: [PATCH 232/279] Debugging. --- src/accl/graph/base/graph_workload.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9c21a3932a..8536c2bbd8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -71,6 +71,7 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = initValue; if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; dir->activate(aligned_addr); } items[index] = new_wl; From f94a1db9834763966994a6cd882d88424e7be468 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 00:58:54 -0800 Subject: [PATCH 233/279] Finalizing bsp and pr. 
--- configs/accl/bfs.py | 3 +- configs/accl/pr.py | 28 +++-- configs/accl/sega.py | 22 ++-- configs/accl/sega_simple.py | 21 ++-- src/accl/graph/base/graph_workload.cc | 131 ++++++--------------- src/accl/graph/base/graph_workload.hh | 34 +++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 46 ++++---- src/accl/graph/sega/centeral_controller.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 63 ++++++++++ src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/enums.cc | 7 -- src/accl/graph/sega/enums.hh | 9 -- src/accl/graph/sega/mpu.hh | 2 + 14 files changed, 193 insertions(+), 182 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 806aa8a915..ab5de485b1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -88,7 +88,7 @@ def get_inputs(): sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA else: @@ -98,6 +98,7 @@ def get_inputs(): m5.instantiate() + system.set_async_mode() system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) if sample: diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e3d7c764ad..ea8a103640 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -35,9 +35,9 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) argparser.add_argument( "--simple", dest="simple", @@ -69,8 +69,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, + args.iterations, args.alpha, - args.threshold, args.simple, args.sample, args.verify, @@ -82,13 +82,13 @@ def get_inputs(): num_gpts, cache_size, graph, + iterations, alpha, - threshold, simple, sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA 
else: @@ -98,8 +98,9 @@ def get_inputs(): m5.instantiate() + system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha, threshold) + system.create_pr_workload(alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -112,11 +113,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") if verify: system.print_answer() - diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1ea36ea49e..07e1b36d9d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -73,8 +73,8 @@ def __init__( ) self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + dram= + DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + gpt = GPT("16GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) @@ -139,15 +139,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + 
self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index f59fa71a79..8727a4c90d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -66,9 +66,9 @@ def __init__( max_propagates_per_cycle=8, update_queue_size=32, ) - + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - + self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( range=AddrRange(edge_memory_size), in_addr_map=False) @@ -129,16 +129,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() - diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 8536c2bbd8..1fa2b287c4 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -111,9 +111,11 @@ std::string BFSWorkload::printWorkListItem(const WorkListItem wl) { return 
csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); } void @@ -232,7 +234,7 @@ SSSPWorkload::printWorkListItem(const WorkListItem wl) void -PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { size_t pkt_size = pkt->getSize(); int num_elements = (int) (pkt_size / sizeof(WorkListItem)); @@ -241,9 +243,12 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) pkt->writeDataToBlock((uint8_t*) items, pkt_size); bool atom_active = false; for (int i = 0; i < num_elements; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(items[i]); + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat(1 - alpha); + new_wl.prop = readFromFloat(1); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; } if (atom_active) { dir->activate(pkt->getAddr()); @@ -254,7 +259,7 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) } uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) +BSPPRWorkload::reduce(uint32_t update, uint32_t value) { float update_float = writeToFloat(update); float value_float = writeToFloat(value); @@ -262,115 +267,47 @@ PRWorkload::reduce(uint32_t update, uint32_t value) } uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); - if (weight == 0) { - weight_float = 1.0; - } - return readFromFloat(alpha * value_float * weight_float); + return readFromFloat(alpha * value_float); } bool 
-PRWorkload::activeCondition(WorkListItem wl) +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return (dist >= threshold) && (wl.degree > 0); + return (old_wl.degree > 0); } uint32_t -PRWorkload::apply(WorkListItem& wl) +BSPPRWorkload::apply(WorkListItem& wl) { - float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; + float delta = prop_float / wl.degree; uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; return delta_uint; } +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat(1 - alpha); + wl.activeFuture = (wl.degree > 0); +} + std::string -PRWorkload::printWorkListItem(const WorkListItem wl) +BSPPRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, prop_float, wl.degree, wl.edgeIndex); -} -// void -// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); -// bool atom_active = false; -// for (int i = 0; i < num_elements; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// atom_active |= activeCondition(items[i]); -// } -// if (atom_active) { -// dir->activate(pkt->getAddr()); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = 
writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = writeToFloat(weight); -// if (weight == 0) { -// weight_float = 1.0; -// } -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::activeCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return (dist >= threshold) && (wl.degree > 0); -// } - -// uint32_t -// PRWorkload::apply(WorkListItem& wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return delta_uint; -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, prop_float, wl.degree, wl.edgeIndex); -// } + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 8e27d16bf9..fdd4928e10 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; @@ -72,6 +73,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +121,24 @@ class SSSPWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; +class BSPPRWorkload : public GraphWorkload +{ + private: + float alpha; -// public: -// PRWorkload(float alpha, float threshold): -// alpha(alpha), threshold(threshold) -// {} + public: + BSPPRWorkload(float alpha): alpha(alpha) {} -// ~PRWorkload() {} + ~BSPPRWorkload() {} -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6de9e03a1c..9dd8f41e61 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,7 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7de6f61b56..0103b1a0c4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -43,8 +43,7 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), - mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::DONT_CARE) + mode(ProcessingMode::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -58,11 +57,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold); -// } +void +CenteralController::createPRWorkload(float alpha) +{ + workload = new BSPPRWorkload(alpha); +} void CenteralController::createPopCountDirectory(int atoms_per_block) @@ -113,9 +112,6 @@ CenteralController::startup() 
panic_if(!image.write(proxy), "%s: Unable to write image."); - if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousState::CONSUMING; - } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -152,20 +148,25 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); - if (state == BulkSynchronousState::APPLYING) { - // TODO: - // 1- Toggle directories - // 2- Check if termination condition is met - // 3- If yes, schedule exit event, - // 4- If not switch state to consuming. - exitSimLoopNow("applying done."); - } else if (state == BulkSynchronousState::CONSUMING) { - // TODO: - // Schedule Bulk apply - exitSimLoopNow("consuming done."); + for (auto mpu: mpuVector) { + mpu->postConsumeProcess(); + mpu->swapDirectories(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } } + exitSimLoopNow("finished an iteration."); + } +} + +int +CenteralController::workCount() +{ + int work_count = 0; + for (auto mpu: mpuVector) { + work_count += mpu->workCount(); } + return work_count; } void @@ -184,7 +185,6 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index b32dc38385..ab039e5024 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -50,7 +50,6 @@ class CenteralController : public ClockedObject Addr maxVertexAddr; ProcessingMode mode; - BulkSynchronousState state; std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -71,10 +70,11 @@ class CenteralController : public ClockedObject void 
createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha); void recvDoneSignal(); + int workCount(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6b44f7395b..32b946d29f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -111,6 +111,69 @@ CoalesceEngine::postMemInitSetup() currentDirectory->setLastAtomAddr(lastAtomAddr); } +void +CoalesceEngine::postConsumeProcess() +{ + WorkListItem items[numElementsPerLine]; + for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + // if (cacheBlocks[block_index].items[index].activeFuture) { + // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + // cacheBlocks[block_index].items[index].activeNow = true; + // cacheBlocks[block_index].items[index].activeFuture = false; + // } + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + 
futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + delete read_pkt; + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete write_pkt; + } + } +} + void CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 39f2491232..c9d8e47f15 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -205,12 +205,14 @@ class CoalesceEngine : public BaseMemoryEngine virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); + void postConsumeProcess(); void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); int workCount(); + int futureWorkCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 099594e9eb..f7ef96197f 100644 --- a/src/accl/graph/sega/enums.cc +++ 
b/src/accl/graph/sega/enums.cc @@ -59,11 +59,4 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = "BULK_SYNCHRONOUS" }; -const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = -{ - "DONT_CARE", - "CONSUMING", - "APPLYING" -}; - } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 969ee8a976..f97c33a0e0 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,15 +69,6 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousState -{ - DONT_CARE, - CONSUMING, - APPLYING, - NUM_BULK_SYNCHRONOUS_STATE, -}; -extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; - } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 7d75e3e0b7..04393db36d 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -70,6 +70,8 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } bool handleIncomingUpdate(PacketPtr pkt); From 1bbb4c777525f9255cce9629ee6a14ec11c163b7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 11:17:39 -0800 Subject: [PATCH 234/279] Fixing a bug in async mode. 
--- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/CenteralController.py | 3 ++- src/accl/graph/sega/centeral_controller.cc | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 07e1b36d9d..b5ce618f7f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("16GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 8727a4c90d..ff97134b47 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -117,7 +117,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("4GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9dd8f41e61..f9544ec539 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -37,7 +37,8 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") - image_file = Param.String("Path to the vertex image file.") + vertex_image_file = Param.String("Path to the vertex image file.") + edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0103b1a0c4..c44789f9f0 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -89,7 +89,7 @@ 
CenteralController::startup() mpu->recvWorkload(workload); } - const auto& file = params().image_file; + const auto& vertex_file = params().vertex_image_file; if (file == "") return; @@ -97,10 +97,10 @@ CenteralController::startup() fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); - loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); + loader::MemoryImage vertex_image = object->buildImage(); + maxVertexAddr = vertex_image.maxAddr(); - PortProxy proxy( + PortProxy vertex_proxy( [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; @@ -110,7 +110,7 @@ CenteralController::startup() } }, system->cacheLineSize()); - panic_if(!image.write(proxy), "%s: Unable to write image."); + panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 32b946d29f..35b2bf71cf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -744,8 +744,6 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - // FIXME: done() might have a different meaning depending on - // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -934,7 +932,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + if (atom_active_future) { + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + } // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); From 8ea1b021bf8fc5c3109000a29f7d60e00378feb7 Mon 
Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 16:03:25 -0800 Subject: [PATCH 235/279] Debugging and removing typos. sega-ddr represent correct system config. --- configs/accl/sega-ddr/bfs.py | 125 +++++++++++++ configs/accl/sega-ddr/pr.py | 128 +++++++++++++ configs/accl/sega-ddr/sega.py | 200 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 1 - src/accl/graph/sega/centeral_controller.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 68 ++++--- src/accl/graph/sega/coalesce_engine.hh | 9 +- 7 files changed, 505 insertions(+), 32 deletions(-) create mode 100644 configs/accl/sega-ddr/bfs.py create mode 100644 configs/accl/sega-ddr/pr.py create mode 100644 configs/accl/sega-ddr/sega.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py new file mode 100644 index 0000000000..8766822b33 --- /dev/null +++ b/configs/accl/sega-ddr/bfs.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + 
system.set_async_mode() + system.create_pop_count_directory(64) + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/sega-ddr/pr.py new file mode 100644 index 0000000000..ea8a103640 --- /dev/null +++ b/configs/accl/sega-ddr/pr.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + iterations, + alpha, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(alpha) + if sample: + while True: + 
exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py new file mode 100644 index 0000000000..c5545ee0f1 --- /dev/null +++ b/configs/accl/sega-ddr/sega.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + 
push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=8, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index f9544ec539..bda2fa3d6a 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -38,7 +38,6 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") vertex_image_file = Param.String("Path to the vertex image file.") - edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c44789f9f0..26e4473b03 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ 
b/src/accl/graph/sega/centeral_controller.cc @@ -90,11 +90,11 @@ CenteralController::startup() } const auto& vertex_file = params().vertex_image_file; - if (file == "") + if (vertex_file == "") return; - auto* object = loader::createObjectFile(file, true); - fatal_if(!object, "%s: Could not load %s.", name(), file); + auto* object = loader::createObjectFile(vertex_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), vertex_file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage vertex_image = object->buildImage(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 35b2bf71cf..263e08d901 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -343,12 +343,14 @@ CoalesceEngine::recvWLRead(Addr addr) if (atom_active_now) { currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -457,14 +459,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = futureDirectory->deactivate(addr); futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } assert(MSHR.find(block_index) != MSHR.end()); @@ -522,15 +526,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->deactivate(addr); + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } activeBuffer.emplace_back(pkt, curTick()); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); } else { + stats.wastefulBytesRead += pkt->getSize(); delete pkt; } @@ -686,15 +692,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Sample frontier size and blockCount here. 
currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } cacheBlocks[block_index].reset(); } @@ -932,17 +939,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } memPort.sendPacket(pkt); onTheFlyReqs++; @@ -956,7 +967,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; } } @@ -1141,8 +1151,8 @@ CoalesceEngine::processNextApplyEvent() } } } else { - 
DPRINTF(CoalesceEngine, "%s: Could not find " - "work to apply.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; } if (pullCondition()) { @@ -1184,6 +1194,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1192,8 +1204,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(frontierSize, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(blockActiveCount, statistics::units::Count::get(), - "Histogram of the popCount values in the directory"), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1225,8 +1241,10 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - frontierSize.init(64); - blockActiveCount.init(64); + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9d8e47f15..8ee17781fc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -171,18 +171,21 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidWriteBacks; + statistics::Scalar worklessCycles; statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram frontierSize; - statistics::Histogram blockActiveCount; + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram currentBlockActiveCount; + statistics::Histogram futureBlockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From 442c1064043aceca4edb6303e82cb365efbbae69 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 22:51:41 -0800 Subject: [PATCH 236/279] Debugging, finalizing the config and merging new workloads. 
--- configs/accl/sega-ddr/bfs.py | 15 +- configs/accl/sega-ddr/cc.py | 119 +++++++++++ configs/accl/sega-ddr/sega.py | 15 +- configs/accl/sega-ddr/sssp.py | 125 +++++++++++ src/accl/graph/base/graph_workload.cc | 233 +++++++-------------- src/accl/graph/base/graph_workload.hh | 81 ++++--- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/centeral_controller.cc | 18 ++ src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 1 - 10 files changed, 408 insertions(+), 205 deletions(-) create mode 100644 configs/accl/sega-ddr/cc.py create mode 100644 configs/accl/sega-ddr/sssp.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py index 8766822b33..97f1b5dc21 100644 --- a/configs/accl/sega-ddr/bfs.py +++ b/configs/accl/sega-ddr/bfs.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +81,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -87,6 +96,7 @@ def get_inputs(): graph, init_addr, init_value, + visited, simple, sample, verify, @@ -103,7 +113,10 @@ def get_inputs(): system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/sega-ddr/cc.py new file mode 100644 index 0000000000..9b6d2b587d --- /dev/null +++ b/configs/accl/sega-ddr/cc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_cc_workload() + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py index c5545ee0f1..8325cf7565 100644 --- a/configs/accl/sega-ddr/sega.py +++ 
b/configs/accl/sega-ddr/sega.py @@ -56,8 +56,8 @@ def __init__(self, register_file_size: int, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -121,7 +121,7 @@ def __init__(self, size: str): dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) ) self.xbar = NoncoherentXBar( - width=8, frontend_latency=1, forward_latency=1, response_latency=1 + width=64, frontend_latency=1, forward_latency=1, response_latency=1 ) self.xbar.mem_side_ports = self.mem_ctrl.port @@ -193,6 +193,15 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sega-ddr/sssp.py new file mode 100644 index 0000000000..f2e60b856a --- /dev/null +++ b/configs/accl/sega-ddr/sssp.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + 
system.print_answer() diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 1fa2b287c4..7471e4d073 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,90 +118,95 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -void -BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} +// void +// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// uint64_t aligned_addr = roundDown(initAddr, pkt_size); + +// if (pkt->getAddr() == aligned_addr) { +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); + +// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); +// items[index].tempProp = initValue; +// if (activeCondition(items[index])) { +// dir->activate(aligned_addr); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } +// } + +// uint32_t +// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +// { +// return std::min(update, value); +// } + +// uint32_t +// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +// { +// return 1; +// } + +// bool +// BFSVisitedWorkload::activeCondition(WorkListItem wl) +// { +// return 
(wl.tempProp < wl.prop) && (wl.degree > 0); +// } + +// uint32_t +// BFSVisitedWorkload::apply(WorkListItem& wl) +// { +// wl.prop = wl.tempProp; +// return wl.prop; +// } + +// std::string +// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +// { +// return csprintf( +// "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", +// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex +// ); +// } uint32_t -BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -{ - return 1; -} - -bool -BFSVisitedWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -BFSVisitedWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; } void -SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) { + Addr pkt_addr = pkt->getAddr(); size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + 
WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + bool vertex_active = activeCondition(new_wl, items[i]); + if (vertex_active) { + new_wl.activeNow = true; } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} + items[i] = new_wl; + atom_active |= vertex_active; -uint32_t -SSSPWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } uint32_t @@ -210,29 +215,6 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } -bool -SSSPWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -SSSPWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -SSSPWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - - void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -309,61 +291,4 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? 
"true" : "false"); } -void -CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - Addr pkt_addr = pkt->getAddr(); - size_t pkt_size = pkt->getSize(); - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - bool atom_active = false; - for (int i = 0; i < num_elements; i++) { - items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; - items[i].prop = -1; - atom_active |= activeCondition(items[i]); - } - if (atom_active) { - dir->activate(pkt->getAddr()); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); -} - -uint32_t -CCWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -CCWorkload::propagate(uint32_t value, uint32_t weight) -{ - return value; -} - -bool -CCWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -CCWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -CCWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fdd4928e10..fa722a634e 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -78,49 +78,31 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class BFSVisitedWorkload : public GraphWorkload +class BFSVisitedWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + 
BFSWorkload(init_addr, init_value) {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; - ~BFSVisitedWorkload() {} - +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): BFSVisitedWorkload(0, 0) {} virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); }; -class SSSPWorkload : public GraphWorkload +class SSSPWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - SSSPWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) {} - - ~SSSPWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; - class BSPPRWorkload : public GraphWorkload { private: @@ -140,21 +122,28 @@ class BSPPRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class CCWorkload : public GraphWorkload -{ - - public: - CCWorkload() {} - - ~CCWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// class 
BSPBCWorkload : public GraphWorkload +// { +// private: +// int currentDepth; +// Addr initAddr; +// uint32_t initValue; + +// public: +// BSPBCWorkload(Addr init_addr, uint32_t init_value): +// currentDepth(1), initAddr(init_addr), initValue(init_value) +// {} + +// ~BSPBCWorkload() {} + +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual void interIterationInit(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bda2fa3d6a..f3210a8ec3 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,9 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 26e4473b03..8414aee259 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -57,6 +57,24 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } +void +CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSVisitedWorkload(init_addr, init_value); +} + +void +CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new 
SSSPWorkload(init_addr, init_value); +} + +void +CenteralController::createCCWorkload() +{ + workload = new CCWorkload(); +} + void CenteralController::createPRWorkload(float alpha) { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab039e5024..aa3938353d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -70,6 +70,9 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); + void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value); + void createSSSPWorkload(Addr init_addr, uint32_t init_value); + void createCCWorkload(); void createPRWorkload(float alpha); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 09f29a43e4..a8c9a1bcb1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -411,7 +411,6 @@ PushEngine::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) 1) << 2); - // FIXME: MemCmd::UpdateWL PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); From 5b132e228342a1535bdecfa9c909b30caee539df Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:15:59 -0800 Subject: [PATCH 237/279] Fixing port proxy bug of limiting size to int. 
--- src/accl/graph/base/graph_workload.cc | 8 ++------ src/accl/graph/sega/centeral_controller.cc | 12 +++++++----- src/accl/graph/sega/mpu.hh | 1 + src/mem/port_proxy.cc | 6 +++--- src/mem/port_proxy.hh | 18 +++++++++--------- src/mem/translating_port_proxy.cc | 6 +++--- src/mem/translating_port_proxy.hh | 6 +++--- 7 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7471e4d073..38f11778b6 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -193,13 +193,9 @@ CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; - bool vertex_active = activeCondition(new_wl, items[i]); - if (vertex_active) { - new_wl.activeNow = true; - } + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; items[i] = new_wl; - atom_active |= vertex_active; - } if (atom_active) { dir->activate(pkt->getAddr()); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 8414aee259..970a0572c5 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -101,6 +101,7 @@ CenteralController::createPopCountDirectory(int atoms_per_block) void CenteralController::startup() { + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->setProcessingMode(mode); @@ -126,7 +127,7 @@ CenteralController::startup() mpu->recvFunctional(pkt); } } - }, system->cacheLineSize()); + }, vertex_atom); panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); @@ -190,18 +191,19 @@ CenteralController::workCount() void CenteralController::printAnswerToHostSimout() { - int num_items = system->cacheLineSize() / 
sizeof(WorkListItem); + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + int num_items = vertex_atom / sizeof(WorkListItem); WorkListItem items[num_items]; - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + PacketPtr pkt = createReadPacket(addr, vertex_atom); for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; if (contains(range_list, addr)) { mpu->recvFunctional(pkt); } } - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + pkt->writeDataToBlock((uint8_t*) items, vertex_atom); for (int i = 0; i < num_items; i++) { std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 04393db36d..95d3adeca5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -67,6 +67,7 @@ class MPU : public SimObject void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for 
(ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. 
*/ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) 
failed", addr); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. 
*/ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5 From e554060e9de004390694cd4c80eb7a0ab4a3ffbb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:31:29 -0800 Subject: [PATCH 238/279] Fixing postConsumeProcess. --- src/accl/graph/sega/coalesce_engine.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 263e08d901..4fa400a63a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -115,7 +115,9 @@ void CoalesceEngine::postConsumeProcess() { WorkListItem items[numElementsPerLine]; - for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); int block_index = getBlockIndex(addr); if (cacheBlocks[block_index].addr == addr) { assert(cacheBlocks[block_index].valid); @@ -125,11 +127,6 @@ CoalesceEngine::postConsumeProcess() bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { assert(!cacheBlocks[block_index].items[index].activeNow); - // if (cacheBlocks[block_index].items[index].activeFuture) { - // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); - // cacheBlocks[block_index].items[index].activeNow = true; - // cacheBlocks[block_index].items[index].activeFuture = false; - // } atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; From d1742f7c07ac64bb05b8fbec109643d32da74857 Mon Sep 17 
00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 18:18:28 -0800 Subject: [PATCH 239/279] Addding BC. --- src/accl/graph/base/graph_workload.cc | 157 +++++++++++++-------- src/accl/graph/base/graph_workload.hh | 52 ++++--- src/accl/graph/sega/centeral_controller.cc | 10 ++ 3 files changed, 140 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 38f11778b6..6ac2018629 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,63 +118,6 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -// void -// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// uint64_t aligned_addr = roundDown(initAddr, pkt_size); - -// if (pkt->getAddr() == aligned_addr) { -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); - -// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); -// items[index].tempProp = initValue; -// if (activeCondition(items[index])) { -// dir->activate(aligned_addr); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } -// } - -// uint32_t -// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -// { -// return std::min(update, value); -// } - -// uint32_t -// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -// { -// return 1; -// } - -// bool -// BFSVisitedWorkload::activeCondition(WorkListItem wl) -// { -// return (wl.tempProp < wl.prop) && (wl.degree > 0); -// } - -// uint32_t -// BFSVisitedWorkload::apply(WorkListItem& wl) -// { -// wl.prop = wl.tempProp; -// return wl.prop; -// } - -// std::string -// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -// { -// return csprintf( -// "WorkListItem{tempProp: %u, prop: %u, degree: 
%u, edgeIndex: %u}", -// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex -// ); -// } - uint32_t BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { return value; @@ -287,4 +230,104 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. + prop &= (4294967295U >> 8); + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + assert(update_depth == (currentDepth - 1)); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = update_depth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777215." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performance metrics could be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. 
Here we reset the depth section of ret. + ret &= (4294967295U >> 8); + // NOTE: Now that the depth is securely reset we can copy the correct value. + ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fa722a634e..4ed3dcf3ac 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void iterate() = 0; virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; @@ -73,6 +74,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); @@ -117,33 +119,39 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -// class BSPBCWorkload : public GraphWorkload -// { -// private: -// int currentDepth; -// Addr initAddr; -// uint32_t initValue; - -// public: -// BSPBCWorkload(Addr init_addr, uint32_t init_value): -// currentDepth(1), initAddr(init_addr), initValue(init_value) -// {} - -// ~BSPBCWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// 
virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual void interIterationInit(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + currentDepth(0), initAddr(init_addr), initValue(init_value), + depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 970a0572c5..15062f1465 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -131,6 +131,11 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to set global variables. + // At this point, we know that vertex memory has been + // initialized and we can initialize global variables. 
+ workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -170,6 +175,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to update global variables. + // At this point, we know that vertex memory has been + // updated and we can update global variables. + workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } From 778c75bf17f8a48d4ea4975193c45d6ed3f93b63 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 20:58:16 -0800 Subject: [PATCH 240/279] Adding BC and degbugging. --- configs/accl/{sega-ddr/pr.py => bc.py} | 18 +- configs/accl/bfs.py | 20 +- configs/accl/{sega-ddr => }/cc.py | 0 configs/accl/sega-ddr/bfs.py | 138 -------------- configs/accl/sega-ddr/sega.py | 209 --------------------- configs/accl/sega.py | 98 +++++++--- configs/accl/sega_simple.py | 96 +++++++--- configs/accl/{sega-ddr => }/sssp.py | 0 src/accl/graph/base/graph_workload.cc | 9 +- src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 18 +- src/accl/graph/sega/centeral_controller.hh | 1 + 13 files changed, 195 insertions(+), 417 deletions(-) rename configs/accl/{sega-ddr/pr.py => bc.py} (90%) rename configs/accl/{sega-ddr => }/cc.py (100%) delete mode 100644 configs/accl/sega-ddr/bfs.py delete mode 100644 configs/accl/sega-ddr/sega.py rename configs/accl/{sega-ddr => }/sssp.py (100%) diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/bc.py similarity index 90% rename from configs/accl/sega-ddr/pr.py rename to configs/accl/bc.py index ea8a103640..074bee73b9 100644 --- a/configs/accl/sega-ddr/pr.py +++ b/configs/accl/bc.py @@ -34,10 +34,12 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", 
type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) + argparser.add_argument("iterations", type=int) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--simple", dest="simple", @@ -67,10 +69,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.iterations, - args.alpha, + args.init_addr, + args.init_value, args.simple, args.sample, args.verify, @@ -80,10 +84,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, iterations, - alpha, + init_addr, + init_value, simple, sample, verify, @@ -93,14 +99,14 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_bc_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index ab5de485b1..97f1b5dc21 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -34,10 +34,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( 
"--simple", dest="simple", @@ -67,10 +76,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -80,10 +91,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, init_addr, init_value, + visited, simple, sample, verify, @@ -93,14 +106,17 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/cc.py similarity index 100% rename from configs/accl/sega-ddr/cc.py rename to configs/accl/cc.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py deleted file mode 100644 index 97f1b5dc21..0000000000 --- a/configs/accl/sega-ddr/bfs.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument( - "--visited", - dest="visited", - action="store_const", - const=True, - default=False, - help="Use visitation version of BFS", - ) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 100us", - ) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.num_registers, - args.cache_size, - args.graph, - args.init_addr, - args.init_value, - args.visited, - args.simple, - args.sample, - args.verify, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - num_registers, - cache_size, - graph, - init_addr, - init_value, - visited, - simple, - sample, - verify, - ) = get_inputs() - - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.set_async_mode() - system.create_pop_count_directory(64) - if visited: - system.create_bfs_visited_workload(init_addr, init_value) - else: - system.create_bfs_workload(init_addr, init_value) - if sample: - while True: - exit_event = m5.simulate(100000000) - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - 
m5.stats.dump() - m5.stats.reset() - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py deleted file mode 100644 index 8325cf7565..0000000000 --- a/configs/accl/sega-ddr/sega.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from math import log -from m5.objects import * - - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append( - AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i, - ) - ) - return ret, intlv_low_bit + intlv_bits - 1 - - -class GPT(SubSystem): - def __init__(self, register_file_size: int, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - max_resp_per_cycle=8, - pending_pull_limit=64, - active_buffer_size=80, - post_push_wb_queue_size=64, - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=4096, - max_propagates_per_cycle=8, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - dram_2=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - ) - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - 
push_engine=self.push_engine, - ) - - def getRespPort(self): - return self.wl_engine.in_ports - - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - - def setReqPort(self, port): - self.push_engine.out_ports = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - - -class EdgeMemory(SubSystem): - def __init__(self, size: str): - super(EdgeMemory, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2.4GHz" - self.clk_domain.voltage_domain = VoltageDomain() - - self.mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) - ) - self.xbar = NoncoherentXBar( - width=64, frontend_latency=1, forward_latency=1, response_latency=1 - ) - self.xbar.mem_side_ports = self.mem_ctrl.port - - def set_image(self, image): - self.mem_ctrl.dram.image_file = image - - def getPort(self): - return self.xbar.cpu_side_ports - - def setPort(self, port): - self.xbar.cpu_side_ports = port - -class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): - super(SEGA, self).__init__() - # num_gpts should be an even power of 2 - assert num_gpts != 0 - assert num_gpts % 2 == 0 - assert (num_gpts & (num_gpts - 1)) == 0 - - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2GHz" - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - # Building the CenteralController - self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") - # Building the EdgeMemories - edge_mem = [] - for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") - 
mem.set_image(f"{graph_path}/edgelist_{i}") - edge_mem.append(mem) - self.edge_mem = edge_mem - # Building the GPTs - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 - ) - gpts = [] - for i in range(num_gpts): - gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_gpts]] - ) - gpt.set_vertex_pch_bit(pch_bit) - gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def work_count(self): - return self.ctrl.workCount() - - def set_async_mode(self): - self.ctrl.setAsyncMode() - - def set_bsp_mode(self): - self.ctrl.setBSPMode() - - def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) - - def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) - - def create_cc_workload(self): - self.ctrl.createCCWorkload() - - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b5ce618f7f..32124731d6 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, 
register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -68,16 +69,14 @@ def __init__( ) self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) - - self.edge_mem_ctrl = MemCtrl( - dram= - DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -97,6 +96,12 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_ranges): self.vertex_mem_ctrl.dram.range = vertex_ranges[0] self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] @@ -104,32 +109,65 @@ def set_vertex_range(self, vertex_ranges): def set_vertex_pch_bit(self, pch_bit): self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_mpus]] + [vertex_ranges[i], vertex_ranges[i + num_gpts]] ) gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -154,8 +192,20 @@ def 
create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff97134b47..ff567b57e3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -67,14 +68,10 @@ def __init__( update_queue_size=32, ) - self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port 
- self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -94,32 +91,77 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), num_mpus, 32 + AddrRange(start=0, size="4GiB"), num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range(vertex_ranges[i]) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -144,8 +186,20 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sssp.py similarity index 100% rename from configs/accl/sega-ddr/sssp.py rename to configs/accl/sssp.py diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6ac2018629..7bcd447b8e 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -245,7 +245,7 @@ BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) uint32_t prop = 0; prop |= initValue; // NOTE: Depth of the initial vertex is 0. 
- prop &= (4294967295U >> 8); + prop &= countMask; new_wl.tempProp = prop; new_wl.prop = prop; if (activeCondition(new_wl, items[index])) { @@ -265,11 +265,10 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) { uint32_t update_depth = (update & depthMask) >> 24; uint32_t update_count = (update & countMask); - assert(update_depth == (currentDepth - 1)); uint32_t value_depth = (value & depthMask) >> 24; uint32_t value_count = (value & countMask); if (value_depth == 255) { - value_depth = update_depth; + value_depth = currentDepth; value_count = 0; } if (value_depth == currentDepth) { @@ -283,7 +282,7 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) " Therefore, performane metrics could be used."); // HACK: Make sure to always set the depth correctly even if count // exceeds the 2^24-1 limit. Here we reset the depth section of ret. - ret &= (4294967295U >> 8); + ret &= countMask; // NOTE: Now that the depth is securely reset we can copy the correct value. ret |= (value_depth << 24); return ret; @@ -311,7 +310,7 @@ bool BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { uint32_t depth = (new_wl.tempProp & depthMask) >> 24; - return (depth == currentDepth); + return (depth == currentDepth) && (new_wl.degree > 0); } std::string diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 4ed3dcf3ac..5a55ad4cdc 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -137,8 +137,8 @@ class BSPBCWorkload : public GraphWorkload uint32_t countMask; public: BSPBCWorkload(Addr init_addr, uint32_t init_value): - currentDepth(0), initAddr(init_addr), initValue(init_value), - depthMask(4278190080), countMask(16777215) + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) {} ~BSPBCWorkload() {} diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 
f3210a8ec3..7e16b7e7de 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -50,6 +50,7 @@ class CenteralController(ClockedObject): PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 15062f1465..86b9ea2b02 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -81,6 +81,12 @@ CenteralController::createPRWorkload(float alpha) workload = new BSPPRWorkload(alpha); } +void +CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BSPBCWorkload(init_addr, init_value); +} + void CenteralController::createPopCountDirectory(int atoms_per_block) { @@ -131,17 +137,13 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to set global variables. - // At this point, we know that vertex memory has been - // initialized and we can initialize global variables. - workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); } PacketPtr @@ -175,15 +177,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to update global variables. - // At this point, we know that vertex memory has been - // updated and we can update global variables. 
- workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); exitSimLoopNow("finished an iteration."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index aa3938353d..ba829061b5 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,6 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createPRWorkload(float alpha); + void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From 492ccc09b5562bc7d67bebed3b4adf724116bf4c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 21:03:16 -0800 Subject: [PATCH 241/279] Fixing BC run script. --- configs/accl/bc.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/configs/accl/bc.py b/configs/accl/bc.py index 074bee73b9..56faeb3e4d 100644 --- a/configs/accl/bc.py +++ b/configs/accl/bc.py @@ -37,7 +37,6 @@ def get_inputs(): argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( @@ -72,7 +71,6 @@ def get_inputs(): args.num_registers, args.cache_size, args.graph, - args.iterations, args.init_addr, args.init_value, args.simple, @@ -87,7 +85,6 @@ def get_inputs(): num_registers, cache_size, graph, - iterations, init_addr, init_value, simple, @@ -119,16 +116,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - iteration = 0 - while iteration < iterations: + iterations = 0 + while True: exit_event = m5.simulate() print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" ) - iteration += 1 
+ iterations += 1 if system.work_count() == 0: break - print(f"#iterations: {iteration}") + print(f"#iterations: {iterations}") if verify: system.print_answer() From f714d88469cc782c71b3958eeab6f0af3223d378 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Nov 2022 22:54:39 -0800 Subject: [PATCH 242/279] Fixing dirty issue in bsp. --- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 32124731d6..672151ceed 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -152,7 +152,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff567b57e3..06908d08d3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -147,7 +147,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4fa400a63a..a2d4378377 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -114,7 +114,6 @@ CoalesceEngine::postMemInitSetup() void CoalesceEngine::postConsumeProcess() { - WorkListItem items[numElementsPerLine]; Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { Addr addr = 
peerMemoryRange.addIntlvBits(local_addr); @@ -133,6 +132,7 @@ CoalesceEngine::postConsumeProcess() if (cacheBlocks[block_index].items[index].activeFuture) { cacheBlocks[block_index].items[index].activeFuture = false; cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; } } if (!atom_active_future_before && atom_active_future_after) { @@ -142,10 +142,10 @@ CoalesceEngine::postConsumeProcess() futureActiveCacheBlocks.erase(block_index); } } else { + WorkListItem items[numElementsPerLine]; PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); memPort.sendFunctional(read_pkt); read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - delete read_pkt; bool atom_active_future_before = false; bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { @@ -166,6 +166,7 @@ CoalesceEngine::postConsumeProcess() } PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); memPort.sendFunctional(write_pkt); + delete read_pkt; delete write_pkt; } } From a342da41983388043f15d7cd8ab83bc62790ce01 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 19:26:29 -0800 Subject: [PATCH 243/279] Adding Async PR. 
--- configs/accl/async-pr.py | 125 +++++++++++++++++++++ configs/accl/pr.py | 6 +- configs/accl/sega.py | 6 + configs/accl/sega_simple.py | 3 + src/accl/graph/base/graph_workload.cc | 78 +++++++++++++ src/accl/graph/base/graph_workload.hh | 30 ++++- src/accl/graph/sega/CenteralController.py | 2 + src/accl/graph/sega/centeral_controller.cc | 13 +++ src/accl/graph/sega/centeral_controller.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 23 +++- src/accl/graph/sega/coalesce_engine.hh | 3 + src/accl/graph/sega/wl_engine.cc | 9 ++ src/accl/graph/sega/wl_engine.hh | 3 + 13 files changed, 294 insertions(+), 9 deletions(-) create mode 100644 configs/accl/async-pr.py diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py new file mode 100644 index 0000000000..0bfb6caeaa --- /dev/null +++ b/configs/accl/async-pr.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + alpha, + threshold, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + 
system.create_pop_count_directory(64) + system.create_async_pr_workload(alpha, threshold) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index ea8a103640..42ae46ea78 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -34,6 +34,7 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) @@ -67,6 +68,7 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.iterations, @@ -80,6 +82,7 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, iterations, @@ -93,7 +96,7 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() @@ -121,6 +124,7 @@ def get_inputs(): + f"because {exit_event.getCause()}" ) iteration += 1 + print(f"error: {system.get_pr_error()}") if system.work_count() == 0: break print(f"#iterations: {iteration}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 672151ceed..ef23575b9b 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -201,9 +201,15 @@ def create_sssp_workload(self, init_addr, init_value): def create_cc_workload(self): self.ctrl.createCCWorkload() + def 
create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def get_pr_error(self): + return self.ctrl.getPRError() + def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 06908d08d3..d6ae8772a5 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -195,6 +195,9 @@ def create_sssp_workload(self, init_addr, init_value): def create_cc_workload(self): self.ctrl.createCCWorkload() + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7bcd447b8e..3a401f0963 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -154,6 +154,81 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(new_wl, items[index]); + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return 
readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -212,6 +287,9 @@ BSPPRWorkload::apply(WorkListItem& wl) void BSPPRWorkload::interIterationInit(WorkListItem& wl) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); wl.prop = wl.tempProp; wl.tempProp = readFromFloat(1 - alpha); wl.activeFuture = (wl.degree > 0); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 5a55ad4cdc..d42bfd0f26 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -105,13 +105,37 @@ class SSSPWorkload : public BFSWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class BSPPRWorkload : public GraphWorkload { private: float alpha; + float error; public: - BSPPRWorkload(float alpha): alpha(alpha) {} + BSPPRWorkload(float alpha): alpha(alpha), error(0) {} ~BSPPRWorkload() {} @@ -119,10 +143,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() {} + virtual void iterate() { error = 0; } virtual void 
interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return error; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7e16b7e7de..c5f44c82e9 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -49,8 +49,10 @@ class CenteralController(ClockedObject): PyBindMethod("createBFSVisitedWorkload"), PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), + PyBindMethod("getPRError"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 86b9ea2b02..23eb6bbc0e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -75,6 +75,12 @@ CenteralController::createCCWorkload() workload = new CCWorkload(); } +void +CenteralController::createAsyncPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + void CenteralController::createPRWorkload(float alpha) { @@ -196,6 +202,13 @@ CenteralController::workCount() return work_count; } +float +CenteralController::getPRError() +{ + BSPPRWorkload* pr_workload = dynamic_cast(workload); + return pr_workload->getError(); +} + void CenteralController::printAnswerToHostSimout() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ba829061b5..e73ed22666 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -73,12 +73,14 @@ class CenteralController : public ClockedObject void createBFSVisitedWorkload(Addr init_addr, 
uint32_t init_value); void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); + void createAsyncPRWorkload(float alpha, float threshold); void createPRWorkload(float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); int workCount(); + float getPRError(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a2d4378377..02c98ba640 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextApplyEvent([this] { processNextApplyEvent(); }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -552,8 +555,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } return true; } @@ -712,8 +715,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; - if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { - owner->recvDoneSignal(); + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -749,8 +753,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -1170,6 +1174,13 @@ CoalesceEngine::processNextApplyEvent() } } +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} 
CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8ee17781fc..b6eec725f9 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -151,6 +151,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ed91622b43..d563450179 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -43,6 +43,7 @@ WLEngine::WLEngine(const WLEngineParams& params): registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), stats(*this) { for (int i = 0; i < params.port_in_ports_connection_count; ++i) { @@ -316,6 +317,14 @@ WLEngine::processNextReduceEvent() } workListFile.clear(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +WLEngine::processNextDoneSignalEvent() +{ if (done()) { owner->recvDoneSignal(); } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 45baaa1e79..fb147e692a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -90,6 +90,9 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct WorkListStats : public statistics::Group { WorkListStats(WLEngine &worklist); From 
a0406ed6fbad7f55a23d8246dffb65b98679a7e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 20:33:07 -0800 Subject: [PATCH 244/279] Fixing typos. --- configs/accl/pr.py | 14 ++++++++++++-- configs/accl/sega.py | 4 ++-- configs/accl/sega_simple.py | 4 ++-- src/accl/graph/base/graph_workload.cc | 6 +++--- src/accl/graph/base/graph_workload.hh | 10 +++++++--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/centeral_controller.hh | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 42ae46ea78..569514eb82 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -36,9 +36,11 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +75,8 @@ def get_inputs(): args.graph, args.iterations, args.alpha, + args.num_nodes, + args.error_threshold, args.simple, args.sample, args.verify, @@ -87,11 +91,15 @@ def get_inputs(): graph, iterations, alpha, + num_nodes, + error_threshold, simple, sample, verify, ) = get_inputs() + print(f"error_threshold: {error_threshold}") + if simple: from sega_simple import SEGA else: @@ -103,7 +111,7 @@ def get_inputs(): system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_pr_workload(num_nodes, alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -125,6 +133,8 @@ def get_inputs(): ) iteration += 1 print(f"error: {system.get_pr_error()}") + if system.get_pr_error() < error_threshold: + break if 
system.work_count() == 0: break print(f"#iterations: {iteration}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ef23575b9b..32d0dd26ab 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -204,8 +204,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def get_pr_error(self): return self.ctrl.getPRError() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index d6ae8772a5..2d36ec584d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -198,8 +198,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3a401f0963..ab58b02b73 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -240,8 +240,8 @@ BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) bool atom_active = false; for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; - new_wl.tempProp = readFromFloat(1 - alpha); - new_wl.prop = readFromFloat(1); + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); new_wl.activeNow = activeCondition(new_wl, items[i]); atom_active |= new_wl.activeNow; items[i] = new_wl; @@ -291,7 +291,7 @@ BSPPRWorkload::interIterationInit(WorkListItem& wl) float prop_float = writeToFloat(wl.prop); error += 
std::abs(temp_float - prop_float); wl.prop = wl.tempProp; - wl.tempProp = readFromFloat(1 - alpha); + wl.tempProp = readFromFloat((1 - alpha) / numNodes); wl.activeFuture = (wl.degree > 0); } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index d42bfd0f26..72748502c1 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -131,11 +131,15 @@ class PRWorkload : public GraphWorkload class BSPPRWorkload : public GraphWorkload { private: + int numNodes; float alpha; + float prevError; float error; public: - BSPPRWorkload(float alpha): alpha(alpha), error(0) {} + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} ~BSPPRWorkload() {} @@ -143,12 +147,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() { error = 0; } + virtual void iterate() { prevError = error; error = 0; } virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); - float getError() { return error; } + float getError() { return prevError; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 23eb6bbc0e..0aee3b77ce 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,9 +82,9 @@ CenteralController::createAsyncPRWorkload(float alpha, float threshold) } void -CenteralController::createPRWorkload(float alpha) +CenteralController::createPRWorkload(int num_nodes, float alpha) { - workload = new BSPPRWorkload(alpha); + workload = new BSPPRWorkload(num_nodes, alpha); } void diff --git 
a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index e73ed22666..cce9ac2725 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,7 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createAsyncPRWorkload(float alpha, float threshold); - void createPRWorkload(float alpha); + void createPRWorkload(int num_nodes, float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From 72567c4abd0a4364c53e055a836c681ff335ee21 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 7 Feb 2023 14:03:15 -0800 Subject: [PATCH 245/279] Fixing init in asyncPR. --- src/accl/graph/base/graph_workload.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ab58b02b73..fd802cf275 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -166,7 +166,8 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = readFromFloat(0); new_wl.prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(new_wl, items[index]); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; items[index] = new_wl; } if (atom_active) { From b56fe04ab1221641f29244192683d5740db5d9e1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 9 Mar 2023 11:27:37 -0800 Subject: [PATCH 246/279] Improving UniqueFIFO implementation. 
--- src/accl/graph/base/data_structs.hh | 101 +++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.cc | 6 ++ src/accl/graph/sega/push_engine.cc | 6 +- src/accl/graph/sega/work_directory.hh | 1 + 4 files changed, 87 insertions(+), 27 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f09a0dd167..a391e0794d 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,7 +34,7 @@ #include #include -#include +#include namespace gem5 { @@ -137,56 +137,107 @@ template class UniqueFIFO { private: - std::list fifo; + int cap; + int pop; + + int* added; + int* deleted; + std::deque container; public: - UniqueFIFO() {} + UniqueFIFO() { + cap = 0; + pop = 0; + added = nullptr; + deleted = nullptr; + } - void push_back(T item) - { - if (!find(item)) { - fifo.push_back(item); + UniqueFIFO(int size) { + cap = size; + pop = 0; + + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; } + container.clear(); } - void pop_front() - { - assert(!fifo.empty()); - fifo.pop_front(); + void fix_front() { + while(true) { + T elem = container.front(); + if (deleted[elem] > 0) { + deleted[elem]--; + added[elem]--; + container.pop_front(); + } else { + assert(deleted[elem] == 0); + assert(added[elem] == 1); + break; + } + } } - T front() - { - return fifo.front(); + T front() { + fix_front(); + return container.front(); } size_t size() { - return fifo.size(); + return pop; } void clear() { - fifo.clear(); + pop = 0; + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); } bool empty() { - return fifo.empty(); + return size() == 0; } bool find(T item) { - // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); - auto it = std::find(fifo.begin(), fifo.end(), item); - return (it != fifo.end()); + assert(added[item] >= 0); + assert(deleted[item] >= 0); + int diff = 
added[item] - deleted[item]; + assert((diff == 0) || (diff == 1)); + return (diff == 1); + } + + void push_back(T item) { + if (!find(item)) { + added[item]++; + pop++; + container.push_back(item); + } + } + + void pop_front() { + T elem = front(); + assert(added[elem] == 1); + added[elem] = 0; + pop--; + container.pop_front(); } void erase(T item) { - // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); - auto it = std::find(fifo.begin(), fifo.end(), item); - assert(it != fifo.end()); - fifo.erase(it); + assert(find(item)); + deleted[item]++; + pop--; } void operator=(const UniqueFIFO& rhs) { - fifo = rhs.fifo; + pop = rhs.pop; + container = rhs.container; + added = rhs.added; + deleted = rhs.deleted; } }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 02c98ba640..8c38341f48 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -69,6 +69,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + currentActiveCacheBlocks = UniqueFIFO(numLines); + futureActiveCacheBlocks = UniqueFIFO(numLines); + activeBuffer.clear(); postPushWBQueue.clear(); } @@ -404,6 +407,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); ReadPurpose* purpose = pkt->findNextSenderState(); + // TODO: delete purpose // NOTE: Regardless of where the pkt will go we have to release the // reserved space for this pkt in the activeBuffer in case @@ -553,6 +557,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) pullsScheduled++; } } + delete purpose; } if (done() && !nextDoneSignalEvent.scheduled()) { @@ -999,6 +1004,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { + DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); 
pullsScheduled--; if (!currentDirectory->empty()) { Addr addr = currentDirectory->getNextWork(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a8c9a1bcb1..981b581b7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -273,7 +273,9 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // TODO: Change above line to below line. + uint8_t pkt_data [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); @@ -291,7 +293,7 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); - delete pkt_data; + // delete [] pkt_data; delete pkt; if (!nextPropagateEvent.scheduled()) { diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 18430aee0d..620e97f654 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -100,6 +100,7 @@ class PopCountDirectory: public WorkDirectory for (int index = 0; index < numCounters; index++) { popCount[index] = 0; } + activeBlockIndices = UniqueFIFO(numCounters); } // CAUTION: This should only be called when the work From bdec32a44646eba0e1f65361b532287ac367d8a4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 21 Mar 2023 14:59:01 -0700 Subject: [PATCH 247/279] Improving sim performance for push engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 1 - src/accl/graph/sega/push_engine.cc | 157 ++++++++++++------------- src/accl/graph/sega/push_engine.hh | 14 ++- 3 files changed, 88 insertions(+), 84 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8c38341f48..fcdd26ceb4 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -407,7 +407,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); ReadPurpose* purpose = pkt->findNextSenderState(); - // TODO: delete purpose // NOTE: Regardless of where the pkt will go we have to release the // reserved space for this pkt in the activeBuffer in case diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 981b581b7c..4703e27d16 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -50,9 +51,13 @@ PushEngine::PushEngine(const Params& params): nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), stats(*this) { + destinationQueues.clear(); for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); + outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); + destinationQueues.emplace_back(); + destinationQueues[i].clear(); + sourceAndValueMaps.emplace_back(); + sourceAndValueMaps[i].clear(); } } @@ -73,7 +78,10 @@ PushEngine::init() { localAddrRange = owner->getAddrRanges(); for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); + AddrRangeList range_list = outPorts[i].getAddrRanges(); + assert(range_list.size() 
== 1); + AddrRange range = outPorts[i].getAddrRanges().front(); + portAddrMap.insert(range, i); } } @@ -108,7 +116,8 @@ PushEngine::ReqPort::recvReqRetry() panic_if(blockedPacket == nullptr, "Received retry without a blockedPacket."); - DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. " + "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); @@ -145,7 +154,7 @@ PushEngine::done() { bool empty_update_queues = true; for (int i = 0; i < outPorts.size(); i++) { - empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + empty_update_queues &= destinationQueues[i].empty(); } return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); @@ -273,8 +282,6 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - // uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; - // TODO: Change above line to below line. 
uint8_t pkt_data [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); @@ -293,7 +300,7 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); - // delete [] pkt_data; + delete pkt; if (!nextPropagateEvent.scheduled()) { @@ -316,10 +323,9 @@ PushEngine::processNextPropagateEvent() uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); - Update update(meta_edge.src, meta_edge.dst, update_value); metaEdgeQueue.pop_front(); - if (enqueueUpdate(update)) { + if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); stats.numPropagates++; @@ -348,61 +354,54 @@ PushEngine::processNextPropagateEvent() } bool -PushEngine::enqueueUpdate(Update update) +PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) { - Addr dst_addr = update.dst; - bool found_coalescing = false; - bool found_locally = false; - bool accepted = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(dst_addr); - } - DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); - for (int i = 0; i < outPorts.size(); i++) { - AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - if (contains(addr_range_list, dst_addr)) { - DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", - __func__, update.to_string(), outPorts[i].id()); - DPRINTF(PushEngine, "%s: There are %d updates already " + Addr aligned_dst = roundDown(dst, owner->vertexAtomSize()); + AddrRange update_range(aligned_dst, aligned_dst + owner->vertexAtomSize()); + auto entry = portAddrMap.contains(update_range); + PortID port_id = entry->second; + + DPRINTF(PushEngine, "%s: Update{src: %lu, dst:%lu, value: %u} " + "belongs to port %d.\n", + __func__, src, dst, value, port_id); + DPRINTF(PushEngine, "%s: There are %d updates already " "in 
queue for port %d.\n", __func__, - updateQueues[outPorts[i].id()].size(), - outPorts[i].id()); - for (auto& entry: updateQueues[outPorts[i].id()]) { - Update& curr_update = std::get<0>(entry); - if (curr_update.dst == update.dst) { - uint32_t old_value = curr_update.value; - curr_update.value = graphWorkload->reduce(old_value, update.value); - DPRINTF(PushEngine, "%s: found a coalescing opportunity " - "for destination %d with new value: %d by " - "coalescing %d and %d. \n", __func__, update.dst, - curr_update.value, old_value, update.value); - found_coalescing = true; - accepted = true; - stats.updateQueueCoalescions++; - } - } - if ((found_coalescing == false) && - (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { - DPRINTF(PushEngine, "%s: There is a free entry available " - "in queue %d.\n", __func__, outPorts[i].id()); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Emplaced the update at the back " - "of queue for port %d is. 
Size of queue " - "for port %d is %d.\n", __func__, - outPorts[i].id(), outPorts[i].id(), - updateQueues[outPorts[i].id()].size()); - accepted = true; - stats.updateQueueLength.sample( - updateQueues[outPorts[i].id()].size()); - } + destinationQueues[port_id].size(), port_id); + + assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size()); + + if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) { + DPRINTF(PushEngine, "%s: Found an existing update " + "for dst: %lu.\n", __func__, dst); + Addr prev_src; + uint32_t prev_val; + std::tie(prev_src, prev_val) = sourceAndValueMaps[port_id][dst]; + uint32_t new_val = graphWorkload->reduce(value, prev_val); + sourceAndValueMaps[port_id][dst] = std::make_tuple(prev_src, new_val); + DPRINTF(PushEngine, "%s: Coalesced Update{src: %lu, dst:%lu, value: %u} " + "with Update{src: %lu, dst:%lu, value: %u} to" + "Update{src: %lu, dst:%lu, value: %u}.\n", __func__, + src, dst, value, prev_src, dst, prev_val, + prev_src, dst, new_val); + stats.updateQueueCoalescions++; + return true; + } else if (destinationQueues[port_id].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue for port %d.\n", __func__, port_id); + destinationQueues[port_id].emplace_back(dst, curTick()); + sourceAndValueMaps[port_id][dst] = std::make_tuple(src, value); + DPRINTF(PushEngine, "%s: Emplaced Update{src: %lu, dst:%lu, value: %u} " + "at the back of queue for port %d. 
" + "Size of queue for port %d is %d.\n", __func__, + src, dst, value, port_id, port_id, + destinationQueues[port_id].size()); + stats.updateQueueLength.sample(destinationQueues[port_id].size()); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); } + return true; } - - if (accepted && (!nextUpdatePushEvent.scheduled())) { - schedule(nextUpdatePushEvent, nextCycle()); - } - - return accepted; + return false; } template PacketPtr @@ -429,30 +428,30 @@ PushEngine::processNextUpdatePushEvent() for (int i = 0; i < outPorts.size(); i++) { if (outPorts[i].blocked()) { - DPRINTF(PushEngine, "%s: Port %d blocked.\n", - __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, i); continue; } - DPRINTF(PushEngine, "%s: Port %d available.\n", - __func__, outPorts[i].id()); - if (updateQueues[outPorts[i].id()].empty()) { - DPRINTF(PushEngine, "%s: Respective queue for port " - "%d is empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, i); + if (destinationQueues[i].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for " + "port %d is empty.\n", __func__, i); continue; } - DPRINTF(PushEngine, "%s: Respective queue for port " - "%d not empty.\n", __func__, outPorts[i].id()); - Update update; + Addr dst; Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); - PacketPtr pkt = createUpdatePacket(update.dst, update.value); + std::tie(dst, entrance_tick) = destinationQueues[i].front(); + Addr src; + uint32_t value; + std::tie(src, value) = sourceAndValueMaps[i][dst]; + + PacketPtr pkt = createUpdatePacket(dst, value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" - "Respective queue size is %d.\n", __func__, - update.to_string(), outPorts[i].id(), - updateQueues[outPorts[i].id()].size()); - updateQueues[outPorts[i].id()].pop_front(); - if (updateQueues[outPorts[i].id()].size() > 0) { + destinationQueues[i].pop_front(); + sourceAndValueMaps[i].erase(dst); + DPRINTF(PushEngine, "%s: Sent Update{src: %lu, dst:%lu, value: %u} to " + "port %d. Respective queue size is %d.\n", __func__, + src, dst, value, i, destinationQueues[i].size()); + if (destinationQueues[i].size() > 0) { next_time_send += 1; } stats.numUpdates++; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index f51865acb3..9f489455ac 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,10 +29,14 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#include +#include + #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/sega/enums.hh" +#include "base/addr_range_map.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -58,7 +62,6 @@ class PushEngine : public BaseMemoryEngine {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } - PortID id() { return _id; } protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -110,12 +113,14 @@ class PushEngine : public BaseMemoryEngine bool done() { return (_start >= _end); } }; + struct PushInfo { Addr src; uint32_t value; Addr offset; int numElements; }; + MPU* owner; GraphWorkload* graphWorkload; @@ -136,9 +141,10 @@ class PushEngine : public BaseMemoryEngine int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); - bool enqueueUpdate(Update update); - std::unordered_map portAddrMap; - std::unordered_map>> updateQueues; + bool enqueueUpdate(Addr src, Addr dst, uint32_t value); + std::vector>> destinationQueues; + std::vector>> 
sourceAndValueMaps; + AddrRangeMap portAddrMap; std::vector outPorts; bool vertexSpace(); From 8e134590077f901c5ca60d9af159fd9c854cb820 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 28 Mar 2023 19:28:44 -0700 Subject: [PATCH 248/279] Randomizing retry sending order. --- src/accl/graph/sega/wl_engine.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index d563450179..442d051e43 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,6 +28,10 @@ #include "accl/graph/sega/wl_engine.hh" +#include +#include +#include + #include "accl/graph/sega/mpu.hh" #include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" @@ -135,8 +139,16 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::checkRetryReq() { + std::vector random_shuffle; + for (int i = 0; i < inPorts.size(); i++) { + random_shuffle.push_back(i); + } + std::random_device rd; + std::mt19937 gen(rd()); + std::shuffle(random_shuffle.begin(), random_shuffle.end(), gen); + for (int i = 0; i < inPorts.size(); i++) { - inPorts[i].checkRetryReq(); + inPorts[random_shuffle[i]].checkRetryReq(); } } From 63245ec1e270ee79b3bee71e5de3d5630c4e9242 Mon Sep 17 00:00:00 2001 From: Ayaz Akram Date: Thu, 10 Nov 2022 13:26:44 -0800 Subject: [PATCH 249/279] mem: HBMCtrl changes to allow PC data buses to be in different states This change updates the HBMCtrl such that both pseudo channels can be in separate states (read or write) at the same time. In addition, the controller queues are now always split in two halves for both pseudo channels. 
Change-Id: Ifb599e611ad99f6c511baaf245bad2b5c9210a86 --- src/mem/hbm_ctrl.cc | 4 ++++ src/mem/mem_ctrl.cc | 28 ++++++++++++++++++++++++++++ src/mem/mem_ctrl.hh | 8 ++++++++ 3 files changed, 40 insertions(+) diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 62a3254364..9f6d81243d 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -289,7 +289,11 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) if (readQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port +<<<<<<< HEAD retryRdReq = true; +======= + MemCtrl::retryRdReq = true; +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states stats.numRdRetry++; return false; } else { diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index 6344e7c228..1f55a3f33d 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -767,7 +767,11 @@ MemCtrl::verifyMultiCmd(Tick cmd_tick, Tick max_cmds_per_burst, } bool +<<<<<<< HEAD MemCtrl::inReadBusState(bool next_state, const MemInterface* mem_intr) const +======= +MemCtrl::inReadBusState(bool next_state, MemInterface* mem_intr) const +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states { // check the bus state if (next_state) { @@ -780,7 +784,11 @@ MemCtrl::inReadBusState(bool next_state, const MemInterface* mem_intr) const } bool +<<<<<<< HEAD MemCtrl::inWriteBusState(bool next_state, const MemInterface* mem_intr) const +======= +MemCtrl::inWriteBusState(bool next_state, MemInterface* mem_intr) const +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states { // check the bus state if (next_state) { @@ -901,14 +909,24 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, if (switched_cmd_type) { if (mem_intr->busState == MemCtrl::READ) { DPRINTF(MemCtrl, +<<<<<<< HEAD "Switching to writes after %d reads with %d reads " "waiting\n", mem_intr->readsThisTime, mem_intr->readQueueSize); +======= + "Switching to writes after %d 
reads with %d reads " + "waiting\n", mem_intr->readsThisTime, mem_intr->readQueueSize); +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states stats.rdPerTurnAround.sample(mem_intr->readsThisTime); mem_intr->readsThisTime = 0; } else { DPRINTF(MemCtrl, +<<<<<<< HEAD "Switching to reads after %d writes with %d writes " "waiting\n", mem_intr->writesThisTime, mem_intr->writeQueueSize); +======= + "Switching to reads after %d writes with %d writes " + "waiting\n", mem_intr->writesThisTime, mem_intr->writeQueueSize); +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states stats.wrPerTurnAround.sample(mem_intr->writesThisTime); mem_intr->writesThisTime = 0; } @@ -1037,8 +1055,12 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // Also ensure that we've issued a minimum defined number // of reads before switching, or have emptied the readQ if ((mem_intr->writeQueueSize > writeHighThreshold) && +<<<<<<< HEAD (mem_intr->readsThisTime >= minReadsPerSwitch || mem_intr->readQueueSize == 0) +======= + (mem_intr->readsThisTime >= minReadsPerSwitch || mem_intr->readQueueSize == 0) +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states && !(nvmWriteBlock(mem_intr))) { switch_to_writes = true; } @@ -1427,8 +1449,14 @@ MemCtrl::drain() { // if there is anything in any of our internal queues, keep track // of that as well +<<<<<<< HEAD if (totalWriteQueueSize || totalReadQueueSize || !respQEmpty() || !allIntfDrained()) { +======= + if (!(!totalWriteQueueSize && !totalReadQueueSize && respQEmpty() && + allIntfDrained())) { + +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states DPRINTF(Drain, "Memory controller not drained, write: %d, read: %d," " resp: %d\n", totalWriteQueueSize, totalReadQueueSize, respQueue.size()); diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index 917798ffa7..2de4184a5a 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -762,7 +762,11 @@ 
class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or next bus state * @return True when bus is currently in a read state */ +<<<<<<< HEAD bool inReadBusState(bool next_state, const MemInterface* mem_intr) const; +======= + bool inReadBusState(bool next_state, MemInterface* mem_intr) const; +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states /** * Check the current direction of the memory channel @@ -770,7 +774,11 @@ class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or next bus state * @return True when bus is currently in a write state */ +<<<<<<< HEAD bool inWriteBusState(bool next_state, const MemInterface* mem_intr) const; +======= + bool inWriteBusState(bool next_state, MemInterface* mem_intr) const; +>>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states Port &getPort(const std::string &if_name, PortID idx=InvalidPortID) override; From 76a12f10bbab5011c3d190517cc6d2ba1771e872 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 28 Mar 2023 19:17:46 -0700 Subject: [PATCH 250/279] Improving wlengine model. 
--- configs/accl/sega.py | 18 ++- configs/accl/sega_simple.py | 6 +- src/accl/graph/sega/WLEngine.py | 4 + src/accl/graph/sega/enums.cc | 6 + src/accl/graph/sega/enums.hh | 9 ++ src/accl/graph/sega/wl_engine.cc | 267 ++++++++++++++++++++----------- src/accl/graph/sega/wl_engine.hh | 14 +- 7 files changed, 228 insertions(+), 96 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 32d0dd26ab..dc7dbabb70 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -50,7 +50,11 @@ class GPT(SubSystem): def __init__(self, register_file_size: int, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size + update_queue_size=64, + register_file_size=register_file_size, + rd_per_cycle=2, + reduce_per_cycle=32, + wr_per_cycle=2, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -109,6 +113,7 @@ def set_vertex_range(self, vertex_ranges): def set_vertex_pch_bit(self, pch_bit): self.vertex_mem_ctrl.pch_bit = pch_bit + class EdgeMemory(SubSystem): def __init__(self, size: str): super(EdgeMemory, self).__init__() @@ -133,6 +138,7 @@ def getPort(self): def setPort(self, port): self.xbar.cpu_side_ports = port + class SEGA(System): def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() @@ -148,10 +154,12 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.mem_mode = "timing" # Building the CenteralController - self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) # Building the EdgeMemories edge_mem = [] - for i in range(int(num_gpts/2)): + for i in range(int(num_gpts / 2)): mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) @@ -167,7 +175,9 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): [vertex_ranges[i], vertex_ranges[i + num_gpts]] 
) gpt.set_vertex_pch_bit(pch_bit) - gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 2d36ec584d..312e721b0c 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -50,7 +50,11 @@ class GPT(SubSystem): def __init__(self, register_file_size: int, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size + update_queue_size=64, + register_file_size=register_file_size, + rd_per_cycle=2, + reduce_per_cycle=32, + wr_per_cycle=2, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 5a8ed9c9fd..0940e6b718 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -44,3 +44,7 @@ class WLEngine(BaseReduceEngine): "WLEngine has. 
It can service as " "many updates as this queueu has " "entries at the same time.") + + rd_per_cycle = Param.Int("Maximum number of reads per cycle.") + reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.") + wr_per_cycle = Param.Int("Maximum number of writes per cycle.") diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index f7ef96197f..2f1bc983eb 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -31,6 +31,12 @@ namespace gem5 { +const char* registerStateStrings[NUM_REGISTER_STATE] = { + "PENDING_READ", + "PENDING_REDUCE", + "PENDING_WRITE" +}; + const char* cacheStateStrings[NUM_CACHE_STATE] = { "INVALID", "PENDING_DATA", diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index f97c33a0e0..4e7d64235e 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -32,6 +32,15 @@ namespace gem5 { +enum RegisterState +{ + PENDING_READ, + PENDING_REDUCE, + PENDING_WRITE, + NUM_REGISTER_STATE +}; +extern const char* registerStateStrings[NUM_REGISTER_STATE]; + enum CacheState { INVALID, diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 442d051e43..cf9599aeef 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -44,9 +44,13 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), + maxReadsPerCycle(params.rd_per_cycle), + maxReducesPerCycle(params.reduce_per_cycle), + maxWritesPerCycle(params.wr_per_cycle), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextWriteEvent([this] { processNextWriteEvent(); }, name()), nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), stats(*this) { @@ -190,89 +194,112 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) void 
WLEngine::processNextReadEvent() { - Addr update_addr; - uint32_t update_value; - Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); - - DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + int num_reads = 0; + while (true) { + Addr update_addr; + uint32_t update_value; + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); + + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((registerFile.find(update_addr) == registerFile.end())) { - DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " - "in registerFile.\n", __func__, update_addr); - if (registerFile.size() < registerFileSize) { - DPRINTF(WLEngine, "%s: There are free registers available in the " - "registerFile.\n", __func__); - ReadReturnStatus read_status = owner->recvWLRead(update_addr); - if (read_status == ReadReturnStatus::ACCEPT) { - DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " - "request to addr: %lu.\n", __func__, update_addr); - registerFile[update_addr] = update_value; - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " - "to registerFile. registerFile.size = %d, " - "registerFileSize = %d.\n", __func__, update_addr, - update_value, registerFile.size(), registerFileSize); - DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " - "to registerFile. 
registerFile.size = %d, " - "registerFileSize = %d.\n", __func__, update_addr, - update_value, registerFile.size(), registerFileSize); + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = std::make_tuple(RegisterState::PENDING_READ, update_value); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + updateQueue.pop_front(); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + vertexReadTime[update_addr] = curTick(); + checkRetryReq(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; + } + } else { + RegisterState state = std::get<0>(registerFile[update_addr]); + if (state == RegisterState::PENDING_WRITE) { + // NOTE: If it's pending write, let it be written. + updateQueue.pop_front(); + updateQueue.emplace_back(update_addr, update_value, enter_tick); + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, std::get<1>(registerFile[update_addr])); + uint32_t curr_value = std::get<1>(registerFile[update_addr]); + uint32_t new_value = graphWorkload->reduce(update_value, curr_value); + registerFile[update_addr] = std::make_tuple(state, new_value); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, std::get<1>(registerFile[update_addr])); + stats.registerFileCoalesce++; updateQueue.pop_front(); stats.updateQueueLatency.sample( - (curTick() - enter_tick) * 1e9 / getClockFrequency()); + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " - "from updateQueue. updateQueue.size = %d. 
" - "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); checkRetryReq(); - vertexReadTime[update_addr] = curTick(); - } else { - if (read_status == ReadReturnStatus::REJECT_ROLL) { - updateQueue.pop_front(); - updateQueue.emplace_back( - update_addr, update_value, enter_tick); - DPRINTF(WLEngine, "%s: Received a reject from cache. " - "Rolling the update.\n", __func__); - stats.numUpdateRolls++; - } else { - DPRINTF(WLEngine, "%s: Received a reject from cache. " - "Not rolling the update.\n", __func__); - } } - } else { - DPRINTF(WLEngine, "%s: There are no free registers " - "available in the registerFile.\n", __func__); - stats.registerShortage++; } - } else { - DPRINTF(WLEngine, "%s: A register has already been allocated for " - "addr: %lu in registerFile. registerFile[%lu] = %u.\n", - __func__, update_addr, update_addr, registerFile[update_addr]); - registerFile[update_addr] = - graphWorkload->reduce(update_value, registerFile[update_addr]); - DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" - " registerFile. registerFile[%lu] = %u.\n", __func__, - update_value, update_addr, registerFile[update_addr]); - stats.registerFileCoalesce++; - updateQueue.pop_front(); - stats.updateQueueLatency.sample( - (curTick() - enter_tick) * 1e9 / getClockFrequency()); - DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " - "from updateQueue. updateQueue.size = %d. " - "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " - "from updateQueue. 
updateQueue.size = %d. " - "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); - checkRetryReq(); + + num_reads++; + if (num_reads >= maxReadsPerCycle) { + // NOTE: Add stat here to count read port shortage. + break; + } + if (updateQueue.empty()) { + break; + } } - if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } + if (!updateQueue.empty() && !nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } } @@ -281,6 +308,7 @@ void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { assert(workListFile.size() <= registerFileSize); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_READ); workListFile[addr] = wl; DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " @@ -290,11 +318,14 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) "workListFile. workListFile.size = %d.\n", __func__, addr, graphWorkload->printWorkListItem(wl), workListFile.size()); + uint32_t value = std::get<0>(registerFile[addr]); + registerFile[addr] = std::make_tuple(RegisterState::PENDING_REDUCE, value); + toReduce.push_back(addr); + stats.vertexReadLatency.sample( ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); vertexReadTime.erase(addr); - assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -303,35 +334,93 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - for (auto &it : workListFile) { - Addr addr = it.first; - assert(registerFile.find(addr) != registerFile.end()); - uint32_t update_value = registerFile[addr]; - DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" - ". 
registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - __func__, addr, registerFile[addr], addr, - graphWorkload->printWorkListItem(workListFile[addr])); - // TODO: Generalize this to reduce function rather than just min + + // for (auto &it : workListFile) { + // Addr addr = it.first; + // assert(registerFile.find(addr) != registerFile.end()); + // uint32_t update_value = registerFile[addr]; + // DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + // ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + // __func__, addr, registerFile[addr], addr, + // graphWorkload->printWorkListItem(workListFile[addr])); + // // TODO: Generalize this to reduce function rather than just min + // workListFile[addr].tempProp = + // graphWorkload->reduce(update_value, workListFile[addr].tempProp); + // DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", + // __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); + // stats.numReduce++; + + // owner->recvWLWrite(addr, workListFile[addr]); + // registerFile.erase(addr); + // DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " + // "registerFile.size = %d, registerFileSize = %d\n", + // __func__, addr, registerFile.size(), registerFileSize); + // DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " + // "registerFile.size = %d, registerFileSize = %d\n", + // __func__, addr, registerFile.size(), registerFileSize); + // } + // workListFile.clear(); + + int num_reduces = 0; + while (true) { + Addr addr = toReduce.front(); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_REDUCE); + uint32_t update_value = std::get<1>(registerFile[addr]); workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); - DPRINTF(WLEngine, "%s: Reduction done. 
workListFile[%lu] = %s.\n", - __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); - stats.numReduce++; + registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); + num_reduces++; + toReduce.pop_front(); + toWrite.push_back(addr); + + if (num_reduces >= maxReducesPerCycle) { + // TODO: Add stat to count reducer shortage; + break; + } + if (toReduce.empty()) { + break; + } + } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } + // if (done() && !nextDoneSignalEvent.scheduled()) { + // schedule(nextDoneSignalEvent, nextCycle()); + // } +} + +void +WLEngine::processNextWriteEvent() +{ + int num_writes = 0; + while (true) { + Addr addr = toWrite.front(); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_WRITE); owner->recvWLWrite(addr, workListFile[addr]); registerFile.erase(addr); - DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " - "registerFile.size = %d, registerFileSize = %d\n", - __func__, addr, registerFile.size(), registerFileSize); - DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. 
" - "registerFile.size = %d, registerFileSize = %d\n", - __func__, addr, registerFile.size(), registerFileSize); + workListFile.erase(addr); + toWrite.pop_front(); + num_writes++; + if (num_writes >= maxWritesPerCycle) { + break; + } + if (toWrite.empty()) { + break; + } } - workListFile.clear(); if (done() && !nextDoneSignalEvent.scheduled()) { schedule(nextDoneSignalEvent, nextCycle()); } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } } void diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index fb147e692a..bd32b16d9d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -79,10 +79,17 @@ class WLEngine : public BaseReduceEngine int updateQueueSize; std::deque> updateQueue; + int maxReadsPerCycle; + int maxReducesPerCycle; + int maxWritesPerCycle; + int registerFileSize; - std::unordered_map registerFile; - std::unordered_map vertexReadTime; + std::unordered_map> registerFile; std::unordered_map workListFile; + std::deque toReduce; + std::deque toWrite; + + std::unordered_map vertexReadTime; EventFunctionWrapper nextReadEvent; void processNextReadEvent(); @@ -90,6 +97,9 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + EventFunctionWrapper nextWriteEvent; + void processNextWriteEvent(); + EventFunctionWrapper nextDoneSignalEvent; void processNextDoneSignalEvent(); From 540694b7024b02d4c62827a60916de21b6372ad4 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 1 Apr 2023 14:35:52 -0700 Subject: [PATCH 251/279] Improving vertex access time by improving updateQeueu reads + more stats --- configs/accl/sega.py | 13 +++----- src/accl/graph/sega/WLEngine.py | 2 ++ src/accl/graph/sega/wl_engine.cc | 53 ++++++++++++++++++++++++++------ src/accl/graph/sega/wl_engine.hh | 5 +++ 4 files changed, 55 insertions(+), 18 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py 
index dc7dbabb70..58a8caddde 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -52,9 +52,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.wl_engine = WLEngine( update_queue_size=64, register_file_size=register_file_size, - rd_per_cycle=2, + rd_per_cycle=4, reduce_per_cycle=32, - wr_per_cycle=2, + wr_per_cycle=4, + num_updates_processed=8, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -73,12 +74,8 @@ def __init__(self, register_file_size: int, cache_size: str): ) self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - dram_2=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), + dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64(), ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 0940e6b718..cfec70081d 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -48,3 +48,5 @@ class WLEngine(BaseReduceEngine): rd_per_cycle = Param.Int("Maximum number of reads per cycle.") reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.") wr_per_cycle = Param.Int("Maximum number of writes per cycle.") + + num_updates_processed = Param.Int("Maximum number of updates processed") diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index cf9599aeef..276fcd1281 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -47,6 +47,7 @@ WLEngine::WLEngine(const WLEngineParams& params): maxReadsPerCycle(params.rd_per_cycle), maxReducesPerCycle(params.reduce_per_cycle), maxWritesPerCycle(params.wr_per_cycle), + maxUpdatesProcessed(params.num_updates_processed), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), @@ 
-171,6 +172,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); + stats.numberIncomingUpdaes++; DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", __func__, pkt->getAddr(), pkt->getLE(), @@ -195,11 +197,23 @@ void WLEngine::processNextReadEvent() { int num_reads = 0; + int num_tries = 0; + std::deque> tempQueue; + + for (int i = 0; i < maxUpdatesProcessed; i++) { + if (updateQueue.empty()) { + break; + } + tempQueue.push_back(updateQueue.front()); + updateQueue.pop_front(); + } + while (true) { + num_tries += 1; Addr update_addr; uint32_t update_value; Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); + std::tie(update_addr, update_value, enter_tick) = tempQueue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -223,13 +237,14 @@ WLEngine::processNextReadEvent() "to registerFile. registerFile.size = %d, " "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); - updateQueue.pop_front(); + tempQueue.pop_front(); + num_reads++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); + update_value, tempQueue.size(), updateQueueSize); DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, @@ -238,8 +253,8 @@ WLEngine::processNextReadEvent() checkRetryReq(); } else { if (read_status == ReadReturnStatus::REJECT_ROLL) { - updateQueue.pop_front(); - updateQueue.emplace_back( + tempQueue.pop_front(); + tempQueue.emplace_back( update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); @@ -258,8 +273,8 @@ WLEngine::processNextReadEvent() RegisterState state = std::get<0>(registerFile[update_addr]); if (state == RegisterState::PENDING_WRITE) { // NOTE: If it's pending write, let it be written. - updateQueue.pop_front(); - updateQueue.emplace_back(update_addr, update_value, enter_tick); + tempQueue.pop_front(); + tempQueue.emplace_back(update_addr, update_value, enter_tick); } else { DPRINTF(WLEngine, "%s: A register has already been allocated for " "addr: %lu in registerFile. registerFile[%lu] = %u.\n", @@ -271,7 +286,7 @@ WLEngine::processNextReadEvent() " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, std::get<1>(registerFile[update_addr])); stats.registerFileCoalesce++; - updateQueue.pop_front(); + tempQueue.pop_front(); stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " @@ -286,16 +301,26 @@ WLEngine::processNextReadEvent() } } - num_reads++; + // num_reads++; if (num_reads >= maxReadsPerCycle) { // NOTE: Add stat here to count read port shortage. 
+ stats.numReadPortShortage++; break; } - if (updateQueue.empty()) { + if (num_tries > maxUpdatesProcessed) { + break; + } + + if (tempQueue.empty()) { break; } } + for (int i = 0; i < tempQueue.size(); i++){ + updateQueue.push_front(tempQueue.back()); + tempQueue.pop_back(); + } + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -407,6 +432,7 @@ WLEngine::processNextWriteEvent() toWrite.pop_front(); num_writes++; if (num_writes >= maxWritesPerCycle) { + stats.numWritePortShortage++; break; } if (toWrite.empty()) { @@ -444,6 +470,12 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numUpdateRolls, statistics::units::Count::get(), "Number of times an update has been rolled back " "to the back of the update queue due to cache reject."), + ADD_STAT(numReadPortShortage, statistics::units::Count::get(), + "Number of times limited by read per cycle."), + ADD_STAT(numWritePortShortage, statistics::units::Count::get(), + "Number of times limited by write per cycle."), + ADD_STAT(numberIncomingUpdaes, statistics::units::Count::get(), + "Number of inocoming updates for each GPT."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), ADD_STAT(updateQueueLatency, statistics::units::Second::get(), @@ -458,6 +490,7 @@ WLEngine::WorkListStats::regStats() vertexReadLatency.init(64); updateQueueLatency.init(64); + } } // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index bd32b16d9d..8f55ecadd4 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -83,6 +83,8 @@ class WLEngine : public BaseReduceEngine int maxReducesPerCycle; int maxWritesPerCycle; + int maxUpdatesProcessed; + int registerFileSize; std::unordered_map> registerFile; std::unordered_map workListFile; @@ -115,6 +117,9 @@ class WLEngine : public BaseReduceEngine statistics::Scalar registerFileCoalesce; 
statistics::Scalar registerShortage; statistics::Scalar numUpdateRolls; + statistics::Scalar numReadPortShortage; + statistics::Scalar numWritePortShortage; + statistics::Scalar numberIncomingUpdaes; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From edeae086f7b848905b6bb3499f771b2d3c671679 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 1 Apr 2023 15:05:15 -0700 Subject: [PATCH 252/279] Cleaning up wl_engine.cc --- src/accl/graph/sega/wl_engine.cc | 34 -------------------------------- 1 file changed, 34 deletions(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 276fcd1281..69e874c0d6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -191,8 +191,6 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } -// TODO: Parameterize the number of pops WLEngine can do at a time. -// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void WLEngine::processNextReadEvent() { @@ -301,9 +299,7 @@ WLEngine::processNextReadEvent() } } - // num_reads++; if (num_reads >= maxReadsPerCycle) { - // NOTE: Add stat here to count read port shortage. stats.numReadPortShortage++; break; } @@ -359,33 +355,6 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - - // for (auto &it : workListFile) { - // Addr addr = it.first; - // assert(registerFile.find(addr) != registerFile.end()); - // uint32_t update_value = registerFile[addr]; - // DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" - // ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - // __func__, addr, registerFile[addr], addr, - // graphWorkload->printWorkListItem(workListFile[addr])); - // // TODO: Generalize this to reduce function rather than just min - // workListFile[addr].tempProp = - // graphWorkload->reduce(update_value, workListFile[addr].tempProp); - // DPRINTF(WLEngine, "%s: Reduction done. 
workListFile[%lu] = %s.\n", - // __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); - // stats.numReduce++; - - // owner->recvWLWrite(addr, workListFile[addr]); - // registerFile.erase(addr); - // DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " - // "registerFile.size = %d, registerFileSize = %d\n", - // __func__, addr, registerFile.size(), registerFileSize); - // DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " - // "registerFile.size = %d, registerFileSize = %d\n", - // __func__, addr, registerFile.size(), registerFileSize); - // } - // workListFile.clear(); - int num_reduces = 0; while (true) { Addr addr = toReduce.front(); @@ -414,9 +383,6 @@ WLEngine::processNextReduceEvent() if (!toReduce.empty() && !nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } - // if (done() && !nextDoneSignalEvent.scheduled()) { - // schedule(nextDoneSignalEvent, nextCycle()); - // } } void From feedf2012646a168f02967b2064a6e89479094c0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 3 Apr 2023 13:00:36 -0700 Subject: [PATCH 253/279] Fixing = operator for UniqueFIFO. 
--- src/accl/graph/base/data_structs.hh | 13 +++++- src/accl/graph/sega/coalesce_engine.cc | 38 +++++++++++++-- src/accl/graph/sega/coalesce_engine.hh | 5 ++ src/accl/graph/sega/wl_engine.cc | 64 ++++++++++++++------------ 4 files changed, 85 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index a391e0794d..60391b3a7c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,6 +34,7 @@ #include #include +#include #include namespace gem5 @@ -166,6 +167,11 @@ class UniqueFIFO container.clear(); } + ~UniqueFIFO() { + delete [] added; + delete [] deleted; + } + void fix_front() { while(true) { T elem = container.front(); @@ -234,10 +240,13 @@ class UniqueFIFO } void operator=(const UniqueFIFO& rhs) { + cap = rhs.cap; pop = rhs.pop; container = rhs.container; - added = rhs.added; - deleted = rhs.deleted; + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + std::memcpy(added, rhs.added, cap * sizeof(int)); + std::memcpy(deleted, rhs.deleted, cap * sizeof(int)); } }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fcdd26ceb4..dcc7feb3dd 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,9 +45,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), - pendingPullLimit(params.pending_pull_limit), + lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), pendingPullLimit(params.pending_pull_limit), pendingPullReads(0), activeBufferSize(params.active_buffer_size), 
postPushWBQueueSize(params.post_push_wb_queue_size), nextMemoryEvent([this] { @@ -74,6 +73,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): activeBuffer.clear(); postPushWBQueue.clear(); + blocksTouchedThisTick.clear(); } void @@ -247,6 +247,10 @@ CoalesceEngine::recvWLRead(Addr addr) assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); + if (lastReadTick < curTick()) { + blocksTouchedThisTick.clear(); + lastReadTick = curTick(); + } int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " @@ -289,9 +293,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + blocksTouchedThisTick.insert(block_index); if (!nextResponseEvent.scheduled()) { schedule(nextResponseEvent, nextCycle()); } + stats.numVertexReads++; return ReadReturnStatus::ACCEPT; } else if ((cacheBlocks[block_index].addr == aligned_addr) && @@ -310,6 +316,8 @@ CoalesceEngine::recvWLRead(Addr addr) "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + blocksTouchedThisTick.insert(block_index); + stats.numVertexReads++; return ReadReturnStatus::ACCEPT; } else { @@ -317,6 +325,11 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].addr != aligned_addr); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); stats.readMisses++; + if (blocksTouchedThisTick.find(block_index) != blocksTouchedThisTick.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has already been " + "accessed this tick.\n", __func__, block_index); + return ReadReturnStatus::REJECT_ROLL; + } if (cacheBlocks[block_index].state != CacheState::INVALID) { // conflict miss 
DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " @@ -324,6 +337,8 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].hasConflict = true; if (cacheBlocks[block_index].state == CacheState::IDLE) { if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is dirty.\n", + __func__, block_index); cacheBlocks[block_index].state = CacheState::PENDING_WB; cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( @@ -334,10 +349,14 @@ CoalesceEngine::recvWLRead(Addr addr) (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is now " + "pending write back.\n", __func__, block_index); } else { // NOTE: The cache block could still be active but // not dirty. If active we only have to active tracking // but can throw the data away. + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not dirty.\n", + __func__, block_index); bool atom_active_now = false; bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { @@ -345,12 +364,16 @@ CoalesceEngine::recvWLRead(Addr addr) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n", + __func__, block_index); currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n", + __func__, block_index); futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); @@ -360,9 +383,13 @@ CoalesceEngine::recvWLRead(Addr addr) // NOTE: Above line where we set hasConflict to true // does not matter 
anymore since we reset the cache line. cacheBlocks[block_index].reset(); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is reset.\n", + __func__, block_index); } + blocksTouchedThisTick.insert(block_index); return ReadReturnStatus::REJECT_NO_ROLL; } else { + blocksTouchedThisTick.insert(block_index); stats.numConflicts++; return ReadReturnStatus::REJECT_ROLL; } @@ -386,6 +413,7 @@ CoalesceEngine::recvWLRead(Addr addr) (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + blocksTouchedThisTick.insert(block_index); return ReadReturnStatus::ACCEPT; } } @@ -497,7 +525,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, + __func__, miss_addr, graphWorkload->printWorkListItem( cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); @@ -798,6 +826,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { @@ -829,6 +858,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { + assert(cacheBlocks[block_index].items[index].activeNow); atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b6eec725f9..f01475118a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,6 +29,8 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#include + #include "accl/graph/base/data_structs.hh" #include 
"accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" @@ -107,6 +109,9 @@ class CoalesceEngine : public BaseMemoryEngine int numElementsPerLine; Block* cacheBlocks; + Tick lastReadTick; + std::unordered_set blocksTouchedThisTick; + int onTheFlyReqs; std::unordered_map> MSHR; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 69e874c0d6..0c96689a5a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -195,23 +195,24 @@ void WLEngine::processNextReadEvent() { int num_reads = 0; + int num_popped = 0; int num_tries = 0; - std::deque> tempQueue; - + std::deque> temp_queue; for (int i = 0; i < maxUpdatesProcessed; i++) { if (updateQueue.empty()) { break; } - tempQueue.push_back(updateQueue.front()); + temp_queue.push_back(updateQueue.front()); updateQueue.pop_front(); } + int max_visits = temp_queue.size(); + while (true) { - num_tries += 1; Addr update_addr; uint32_t update_value; Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = tempQueue.front(); + std::tie(update_addr, update_value, enter_tick) = temp_queue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -235,48 +236,54 @@ WLEngine::processNextReadEvent() "to registerFile. registerFile.size = %d, " "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); - tempQueue.pop_front(); + temp_queue.pop_front(); num_reads++; + num_popped++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, - update_value, tempQueue.size(), updateQueueSize); + update_value, temp_queue.size(), updateQueueSize); DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. 
updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); vertexReadTime[update_addr] = curTick(); - checkRetryReq(); } else { if (read_status == ReadReturnStatus::REJECT_ROLL) { - tempQueue.pop_front(); - tempQueue.emplace_back( - update_addr, update_value, enter_tick); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); stats.numUpdateRolls++; } else { - DPRINTF(WLEngine, "%s: Received a reject from cache. " - "Not rolling the update.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject with no roll " + "from cache. Rolling the update anyway.\n", __func__); } } } else { DPRINTF(WLEngine, "%s: There are no free registers " "available in the registerFile.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); stats.registerShortage++; } } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, + update_addr, update_addr, std::get<1>(registerFile[update_addr])); RegisterState state = std::get<0>(registerFile[update_addr]); if (state == RegisterState::PENDING_WRITE) { // NOTE: If it's pending write, let it be written. - tempQueue.pop_front(); - tempQueue.emplace_back(update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Respective register for addr: " + "%lu is pending a write to the cache. Rolling " + "the update.\n", __func__, update_addr); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); } else { - DPRINTF(WLEngine, "%s: A register has already been allocated for " - "addr: %lu in registerFile. 
registerFile[%lu] = %u.\n", - __func__, update_addr, update_addr, std::get<1>(registerFile[update_addr])); uint32_t curr_value = std::get<1>(registerFile[update_addr]); uint32_t new_value = graphWorkload->reduce(update_value, curr_value); registerFile[update_addr] = std::make_tuple(state, new_value); @@ -284,7 +291,8 @@ WLEngine::processNextReadEvent() " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, std::get<1>(registerFile[update_addr])); stats.registerFileCoalesce++; - tempQueue.pop_front(); + temp_queue.pop_front(); + num_popped++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " @@ -295,30 +303,28 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - checkRetryReq(); } } + num_tries++; if (num_reads >= maxReadsPerCycle) { stats.numReadPortShortage++; break; } - if (num_tries > maxUpdatesProcessed) { + if (num_tries >= max_visits) { break; } - - if (tempQueue.empty()) { + if (temp_queue.empty()) { break; } } - for (int i = 0; i < tempQueue.size(); i++){ - updateQueue.push_front(tempQueue.back()); - tempQueue.pop_back(); + while (!temp_queue.empty()) { + updateQueue.push_front(temp_queue.back()); + temp_queue.pop_back(); } - - if (!toReduce.empty() && !nextReduceEvent.scheduled()) { - schedule(nextReduceEvent, nextCycle()); + if (num_popped > 0) { + checkRetryReq(); } if (!updateQueue.empty() && !nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); From 850d4526a1e0347621db9ca1bbedf23ce92ce031 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Apr 2023 22:21:26 -0700 Subject: [PATCH 254/279] Fixing a typo. 
--- src/accl/graph/sega/wl_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0c96689a5a..4f23d65d32 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -345,7 +345,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) "workListFile. workListFile.size = %d.\n", __func__, addr, graphWorkload->printWorkListItem(wl), workListFile.size()); - uint32_t value = std::get<0>(registerFile[addr]); + uint32_t value = std::get<1>(registerFile[addr]); registerFile[addr] = std::make_tuple(RegisterState::PENDING_REDUCE, value); toReduce.push_back(addr); From 0180566e3eee1c15e019aa25750140f60eb2e8cf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Apr 2023 14:05:33 -0700 Subject: [PATCH 255/279] Updating wl_engine stats. Adding colaescing to update queue. --- src/accl/graph/sega/coalesce_engine.cc | 1 - src/accl/graph/sega/wl_engine.cc | 97 ++++++++++++++++---------- src/accl/graph/sega/wl_engine.hh | 12 ++-- 3 files changed, 69 insertions(+), 41 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dcc7feb3dd..42ae604833 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -858,7 +858,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - assert(cacheBlocks[block_index].items[index].activeNow); atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 4f23d65d32..5a4a960635 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -166,22 +166,38 @@ WLEngine::done() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - assert((updateQueueSize == 0) || 
(updateQueue.size() <= updateQueueSize)); - if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { - return false; + Addr update_addr = pkt->getAddr(); + uint32_t update_value = pkt->getLE(); + + if (valueMap.find(update_addr) != valueMap.end()) { + assert((updateQueueSize == 0) || + (updateQueue.size() <= updateQueueSize)); + DPRINTF(WLEngine, "%s: Found an already queued update to %u. ", + "Current value is: %u.\n", __func__, + update_addr, valueMap[update_addr]); + valueMap[update_addr] = + graphWorkload->reduce(update_value, valueMap[update_addr]); + stats.numIncomingUpdates++; + stats.updateQueueCoalescions++; + } else { + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } else { + updateQueue.emplace_back(update_addr, curTick()); + valueMap[update_addr] = update_value; + stats.numIncomingUpdates++; + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + } } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); - stats.numberIncomingUpdaes++; - DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - // delete the packet since it's not needed anymore. 
delete pkt; @@ -194,10 +210,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) void WLEngine::processNextReadEvent() { - int num_reads = 0; - int num_popped = 0; - int num_tries = 0; - std::deque> temp_queue; + std::deque> temp_queue; for (int i = 0; i < maxUpdatesProcessed; i++) { if (updateQueue.empty()) { break; @@ -206,17 +219,18 @@ WLEngine::processNextReadEvent() updateQueue.pop_front(); } + int num_reads = 0; + int num_popped = 0; + int num_tries = 0; int max_visits = temp_queue.size(); - while (true) { Addr update_addr; - uint32_t update_value; Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = temp_queue.front(); + std::tie(update_addr, enter_tick) = temp_queue.front(); + uint32_t update_value = valueMap[update_addr]; DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((registerFile.find(update_addr) == registerFile.end())) { DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " "in registerFile.\n", __func__, update_addr); @@ -237,6 +251,7 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); temp_queue.pop_front(); + valueMap.erase(update_addr); num_reads++; num_popped++; stats.updateQueueLatency.sample( @@ -253,13 +268,13 @@ WLEngine::processNextReadEvent() } else { if (read_status == ReadReturnStatus::REJECT_ROLL) { temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); stats.numUpdateRolls++; } else { temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); DPRINTF(WLEngine, "%s: Received a reject with no roll " "from cache. 
Rolling the update anyway.\n", __func__); } @@ -268,7 +283,7 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: There are no free registers " "available in the registerFile.\n", __func__); temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); stats.registerShortage++; } } else { @@ -282,7 +297,7 @@ WLEngine::processNextReadEvent() "%lu is pending a write to the cache. Rolling " "the update.\n", __func__, update_addr); temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); } else { uint32_t curr_value = std::get<1>(registerFile[update_addr]); uint32_t new_value = graphWorkload->reduce(update_value, curr_value); @@ -290,8 +305,9 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, std::get<1>(registerFile[update_addr])); - stats.registerFileCoalesce++; + stats.registerFileCoalescions++; temp_queue.pop_front(); + valueMap.erase(update_addr); num_popped++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); @@ -308,7 +324,9 @@ WLEngine::processNextReadEvent() num_tries++; if (num_reads >= maxReadsPerCycle) { - stats.numReadPortShortage++; + if (!temp_queue.empty()) { + stats.numReadPortShortage++; + } break; } if (num_tries >= max_visits) { @@ -370,11 +388,14 @@ WLEngine::processNextReduceEvent() graphWorkload->reduce(update_value, workListFile[addr].tempProp); registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); num_reduces++; + stats.numReductions++; toReduce.pop_front(); toWrite.push_back(addr); if (num_reduces >= maxReducesPerCycle) { - // TODO: Add stat to count reducer shortage; + if (!toReduce.empty()) { + stats.numReducerShortage++; + } break; } if (toReduce.empty()) { @@ -404,7 +425,9 
@@ WLEngine::processNextWriteEvent() toWrite.pop_front(); num_writes++; if (num_writes >= maxWritesPerCycle) { - stats.numWritePortShortage++; + if (!toWrite.empty()) { + stats.numWritePortShortage++; + } break; } if (toWrite.empty()) { @@ -432,10 +455,8 @@ WLEngine::processNextDoneSignalEvent() WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) : statistics::Group(&_wl), wl(_wl), - ADD_STAT(numReduce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), - ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " "stalled because of register shortage"), @@ -444,9 +465,15 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "to the back of the update queue due to cache reject."), ADD_STAT(numReadPortShortage, statistics::units::Count::get(), "Number of times limited by read per cycle."), + ADD_STAT(registerFileCoalescions, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numReductions, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numReducerShortage, statistics::units::Count::get(), + "Number of times limited by number of reducers."), ADD_STAT(numWritePortShortage, statistics::units::Count::get(), "Number of times limited by write per cycle."), - ADD_STAT(numberIncomingUpdaes, statistics::units::Count::get(), + ADD_STAT(numIncomingUpdates, statistics::units::Count::get(), "Number of inocoming updates for each GPT."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 8f55ecadd4..bb8e82f501 100644 --- 
a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -77,7 +77,8 @@ class WLEngine : public BaseReduceEngine std::vector inPorts; int updateQueueSize; - std::deque> updateQueue; + std::deque> updateQueue; + std::unordered_map valueMap; int maxReadsPerCycle; int maxReducesPerCycle; @@ -112,14 +113,15 @@ class WLEngine : public BaseReduceEngine void regStats() override; WLEngine &wl; - - statistics::Scalar numReduce; - statistics::Scalar registerFileCoalesce; + statistics::Scalar updateQueueCoalescions; statistics::Scalar registerShortage; statistics::Scalar numUpdateRolls; statistics::Scalar numReadPortShortage; + statistics::Scalar registerFileCoalescions; + statistics::Scalar numReductions; + statistics::Scalar numReducerShortage; statistics::Scalar numWritePortShortage; - statistics::Scalar numberIncomingUpdaes; + statistics::Scalar numIncomingUpdates; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From 121c8242836a81cd6b39e37812c17d86aa5e5c05 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 7 Apr 2023 08:56:37 -0700 Subject: [PATCH 256/279] Adding number of transitions. 
--- src/accl/graph/sega/CoalesceEngine.py | 1 + src/accl/graph/sega/coalesce_engine.cc | 209 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 21 +-- 3 files changed, 128 insertions(+), 103 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 25f8a1c58b..bb45802c1d 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -47,3 +47,4 @@ class CoalesceEngine(BaseMemoryEngine): "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") + transitions_per_cycle = Param.Int("Max number of transitions in a cycle") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 42ae604833..a2653952e0 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,9 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), pendingPullLimit(params.pending_pull_limit), + numReceivedPulls(0), numScheduledPulls(0), pendingPullLimit(params.pending_pull_limit), pendingPullReads(0), activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), + transitionsPerCycle(params.transitions_per_cycle), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -68,8 +69,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - currentActiveCacheBlocks = UniqueFIFO(numLines); - futureActiveCacheBlocks = UniqueFIFO(numLines); + numActiveBlocksNow = UniqueFIFO(numLines); + numActiveBlocksNext = UniqueFIFO(numLines); activeBuffer.clear(); 
postPushWBQueue.clear(); @@ -142,10 +143,10 @@ CoalesceEngine::postConsumeProcess() } } if (!atom_active_future_before && atom_active_future_after) { - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); } if (atom_active_future_before && !atom_active_future_after) { - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); } } else { WorkListItem items[numElementsPerLine]; @@ -199,35 +200,35 @@ void CoalesceEngine::swapDirectories() { assert(currentDirectory->empty()); - assert(currentActiveCacheBlocks.empty()); + assert(numActiveBlocksNow.empty()); // assert currentDirectory is empty WorkDirectory* temp = currentDirectory; currentDirectory = futureDirectory; futureDirectory = temp; - currentActiveCacheBlocks.clear(); - currentActiveCacheBlocks = futureActiveCacheBlocks; - futureActiveCacheBlocks.clear(); + numActiveBlocksNow.clear(); + numActiveBlocksNow = numActiveBlocksNext; + numActiveBlocksNext.clear(); } bool CoalesceEngine::done() { - return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + return memAccBuffer.empty() && numActiveBlocksNow.empty() && activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); } bool CoalesceEngine::enoughSpace() { - return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; + return (activeBuffer.size() + pendingPullReads + numScheduledPulls) < activeBufferSize; } bool CoalesceEngine::pullCondition() { bool enough_space = enoughSpace(); - bool schedule_limit = pullsScheduled < pendingPullLimit; + bool schedule_limit = numScheduledPulls < pendingPullLimit; return enough_space && schedule_limit; } @@ -341,7 +342,7 @@ CoalesceEngine::recvWLRead(Addr addr) __func__, block_index); cacheBlocks[block_index].state = CacheState::PENDING_WB; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int block_index, Tick schedule_tick) { 
processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); @@ -366,18 +367,18 @@ CoalesceEngine::recvWLRead(Addr addr) if (atom_active_now) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n", __func__, block_index); - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); } if (atom_active_future) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n", __func__, block_index); - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -405,7 +406,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].lastChangedTick = curTick(); MSHR[block_index].push_back(addr); - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextRead(block_index, schedule_tick); }, block_index, curTick()); @@ -492,15 +493,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (atom_active_now) { int count = currentDirectory->deactivate(addr); - currentActiveCacheBlocks.push_back(block_index); + numActiveBlocksNow.push_back(block_index); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); } if (atom_active_future) { int count = futureDirectory->deactivate(addr); - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } assert(MSHR.find(block_index) != MSHR.end()); @@ -560,11 +561,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (atom_active_now) { int count = currentDirectory->deactivate(addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); if (atom_active_future) { int count = futureDirectory->deactivate(addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } activeBuffer.emplace_back(pkt, curTick()); } else { @@ -573,15 +574,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (pullCondition()) { - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextVertexPull(ignore, schedule_tick); - }, 0, curTick()); + }, -1, 
curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } - pullsScheduled++; + numScheduledPulls++; } } delete purpose; @@ -681,8 +682,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; if (mode == ProcessingMode::ASYNCHRONOUS) { cacheBlocks[block_index].items[wl_offset].activeNow |= active; - if (active && (!currentActiveCacheBlocks.find(block_index))) { - currentActiveCacheBlocks.push_back(block_index); + if (active && (!numActiveBlocksNow.find(block_index))) { + numActiveBlocksNow.push_back(block_index); if (!owner->running()) { owner->start(); } @@ -690,8 +691,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (mode == ProcessingMode::BULK_SYNCHRONOUS) { cacheBlocks[block_index].items[wl_offset].activeFuture |= active; - if (active && (!futureActiveCacheBlocks.find(block_index))) { - futureActiveCacheBlocks.push_back(block_index); + if (active && (!numActiveBlocksNext.find(block_index))) { + numActiveBlocksNext.push_back(block_index); } } @@ -709,7 +710,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (cacheBlocks[block_index].dirty) { cacheBlocks[block_index].state = CacheState::PENDING_WB; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); @@ -725,16 +726,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + 
stats.countActiveBlocksNow.sample(count); } if (atom_active_future) { - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } cacheBlocks[block_index].reset(); } @@ -756,32 +757,52 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextMemoryEvent() { - if (memPort.blocked()) { - stats.numMemoryBlocks++; - nextMemoryEvent.sleep(); - return; + int num_transitions = 0; + std::unordered_set transitions; + FunctionDeque temp_deque; + temp_deque.clear(); + + while (true) { + if (memPort.blocked()) { + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function function; + int input; + Tick tick; + std::tie(function, input, tick) = memAccBuffer.front(); + if ((transitions.find(input) == transitions.end()) || (input == -1)) { + function(input, tick); + memAccBuffer.pop_front(); + transitions.insert(input); + stats.memAccBufferLat.sample((curTick() - tick) * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memAccBuffer. 
" + "memAccBuffer.size = %d.\n", __func__, memAccBuffer.size()); + num_transitions++; + } else { + temp_deque.emplace_back(function, input, tick); + memAccBuffer.pop_front(); + } + if ((num_transitions >= transitionsPerCycle) || memAccBuffer.empty()) { + break; + } } - DPRINTF(CoalesceEngine, "%s: Processing another " - "memory function.\n", __func__); - std::function next_memory_function; - int next_memory_function_input; - Tick next_memory_function_tick; - std::tie( - next_memory_function, - next_memory_function_input, - next_memory_function_tick) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input, next_memory_function_tick); - memoryFunctionQueue.pop_front(); - stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) - * 1e9 / getClockFrequency()); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } assert(!nextMemoryEvent.pending()); assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { + if ((!memAccBuffer.empty())) { schedule(nextMemoryEvent, nextCycle()); } @@ -830,7 +851,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); } need_send_pkt = false; @@ -852,7 +873,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].lastChangedTick = curTick(); // If an atom is in the activeBuffer, // then it is definitely currently active. - currentActiveCacheBlocks.push_back(block_index); + numActiveBlocksNow.push_back(block_index); // NOTE: Residence in the activeBuffer does not // signify anything about future activity. 
bool atom_active_future = false; @@ -861,18 +882,18 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); } need_send_pkt = false; ab = activeBuffer.erase(ab); delete ab_pkt; if (pullCondition()) { - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextVertexPull(ignore, schedule_tick); - }, 0, curTick()); - pullsScheduled++; + }, -1, curTick()); + numScheduledPulls++; } } else { ab++; @@ -966,20 +987,20 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); if (atom_active_future) { - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); } if (atom_active_now) { - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); if (enoughSpace()) { activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } memPort.sendPacket(pkt); onTheFlyReqs++; @@ -988,7 +1009,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); 
} memPort.sendPacket(pkt); onTheFlyReqs++; @@ -1033,7 +1054,7 @@ void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); - pullsScheduled--; + numScheduledPulls--; if (!currentDirectory->empty()) { Addr addr = currentDirectory->getNextWork(); int block_index = getBlockIndex(addr); @@ -1081,14 +1102,14 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); + return numActiveBlocksNow.size() + currentDirectory->workCount() + activeBuffer.size(); } void CoalesceEngine::recvVertexPull() { - pullsReceived++; - DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived); + numReceivedPulls++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. numReceivedPulls: %d.\n", __func__, numReceivedPulls); stats.verticesPulled++; stats.lastVertexPullTime = curTick() - stats.lastResetTick; @@ -1109,14 +1130,14 @@ CoalesceEngine::processNextApplyEvent() std::tie(pkt, entrance_tick) = activeBuffer.front(); pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { if (items[index].activeNow) { Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(items[index]); items[index].activeNow = false; owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); - pullsReceived--; + numReceivedPulls--; stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -1135,23 +1156,23 @@ CoalesceEngine::processNextApplyEvent() peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); activeBuffer.pop_front(); - memoryFunctionQueue.emplace_back( + 
memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); + }, -1, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } delete pkt; } - } else if (!currentActiveCacheBlocks.empty()) { + } else if (!numActiveBlocksNow.empty()) { int num_visited_indices = 0; - int initial_fifo_length = currentActiveCacheBlocks.size(); + int initial_fifo_length = numActiveBlocksNow.size(); while (true) { - int block_index = currentActiveCacheBlocks.front(); + int block_index = numActiveBlocksNow.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { - for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { if (cacheBlocks[block_index].items[index].activeNow) { Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); @@ -1160,7 +1181,7 @@ CoalesceEngine::processNextApplyEvent() owner->recvVertexPush(addr, delta, cacheBlocks[block_index].items[index].edgeIndex, cacheBlocks[block_index].items[index].degree); - pullsReceived--; + numReceivedPulls--; stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -1172,14 +1193,14 @@ CoalesceEngine::processNextApplyEvent() } // NOTE: If we have reached the last item in the cache block if (!atom_active_now) { - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); } break; } // NOTE: If the block with index at the front of activeCacheBlocks // is not in IDLE state, then roll the that index to the back - currentActiveCacheBlocks.pop_front(); - currentActiveCacheBlocks.push_back(block_index); + numActiveBlocksNow.pop_front(); + numActiveBlocksNow.push_back(block_index); // NOTE: If we have visited all the items initially 
in the FIFO. num_visited_indices++; if (num_visited_indices == initial_fifo_length) { @@ -1192,18 +1213,18 @@ CoalesceEngine::processNextApplyEvent() } if (pullCondition()) { - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextVertexPull(ignore, schedule_tick); - }, 0, curTick()); + }, -1, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } - pullsScheduled++; + numScheduledPulls++; } - if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + if ((numReceivedPulls > 0) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } } @@ -1261,13 +1282,13 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Histogram of the length of the current bitvector."), ADD_STAT(futureFrontierSize, statistics::units::Count::get(), "Histogram of the length of the future bitvector."), - ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + ADD_STAT(countActiveBlocksNow, statistics::units::Count::get(), "Histogram of the popCount values in the current directory"), - ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + ADD_STAT(countActiveBlocksNext, statistics::units::Count::get(), "Histogram of the popCount values in the future directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), - ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + ADD_STAT(memAccBufferLat, statistics::units::Second::get(), "Histogram of the latency of processing a memory function.") { } @@ -1286,10 +1307,10 @@ CoalesceEngine::CoalesceStats::regStats() currentFrontierSize.init(64); futureFrontierSize.init(64); - currentBlockActiveCount.init(64); - futureBlockActiveCount.init(64); + countActiveBlocksNow.init(64); + countActiveBlocksNext.init(64); responseQueueLatency.init(64); - memoryFunctionLatency.init(64); + memAccBufferLat.init(64); } void diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f01475118a..4066c7dbe5 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,6 +43,8 @@ namespace gem5 { +typedef std::deque, int, Tick>> FunctionDeque; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -120,12 +122,12 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> responseQueue; // Tracking work in cache - int pullsReceived; + int numReceivedPulls; // NOTE: Remember to erase from these upon eviction from cache - UniqueFIFO currentActiveCacheBlocks; - UniqueFIFO futureActiveCacheBlocks; + UniqueFIFO numActiveBlocksNow; + UniqueFIFO numActiveBlocksNext; - int pullsScheduled; + int numScheduledPulls; int pendingPullLimit; int pendingPullReads; // A map from addr to sendMask. 
sendMask determines which bytes to @@ -141,14 +143,15 @@ class CoalesceEngine : public BaseMemoryEngine bool pullCondition(); int getBlockIndex(Addr addr); + int transitionsPerCycle; + FunctionDeque memAccBuffer; + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); void processNextVertexPull(int ignore, Tick schedule_tick); void processNextPostPushWB(int ignore, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); @@ -192,10 +195,10 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram currentFrontierSize; statistics::Histogram futureFrontierSize; - statistics::Histogram currentBlockActiveCount; - statistics::Histogram futureBlockActiveCount; + statistics::Histogram countActiveBlocksNow; + statistics::Histogram countActiveBlocksNext; statistics::Histogram responseQueueLatency; - statistics::Histogram memoryFunctionLatency; + statistics::Histogram memAccBufferLat; }; CoalesceStats stats; From 25b6f1fe1bfa31139421224227d2c018998dbe5b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 9 Apr 2023 18:47:46 -0700 Subject: [PATCH 257/279] Improving the performance of pushEngine. 
--- src/accl/graph/sega/push_engine.cc | 45 ++++++++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 2 +- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4703e27d16..3279fb9450 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -295,6 +295,7 @@ PushEngine::handleMemResp(PacketPtr pkt) metaEdgeQueue.emplace_back(meta_edge, curTick()); stats.edgeQueueLength.sample(metaEdgeQueue.size()); } + stats.edgeQueueLength.sample(metaEdgeQueue.size()); stats.numWastefulEdgesRead += (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; @@ -313,38 +314,57 @@ void PushEngine::processNextPropagateEvent() { int num_propagates = 0; + int num_tries = 0; + int num_reads = 0; + std::deque> temp_edge; + for (int i = 0; i < maxPropagatesPerCycle; i++) { + if (metaEdgeQueue.empty()) { + break; + } + temp_edge.push_back(metaEdgeQueue.front()); + metaEdgeQueue.pop_front(); + } + int max_visits = temp_edge.size(); + while(true) { MetaEdge meta_edge; Tick entrance_tick; - std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); + std::tie(meta_edge, entrance_tick) = temp_edge.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); - metaEdgeQueue.pop_front(); + temp_edge.pop_front(); + num_tries++; if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); + num_reads++; stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); - stats.edgeQueueLength.sample(metaEdgeQueue.size()); } else { - metaEdgeQueue.emplace_back(meta_edge, entrance_tick); + temp_edge.emplace_back(meta_edge, entrance_tick); + stats.updateQueueFull++; } num_propagates++; - if (metaEdgeQueue.empty()) { 
+ if (temp_edge.empty()) { break; } - if (num_propagates >= maxPropagatesPerCycle) { + if (num_tries >= max_visits) { break; } } + while (!temp_edge.empty()) { + metaEdgeQueue.push_front(temp_edge.back()); + temp_edge.pop_back(); + } + stats.numPropagatesHist.sample(num_propagates); assert(!nextPropagateEvent.scheduled()); @@ -370,6 +390,11 @@ PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size()); + int num_updates = 0; + for (auto queue: destinationQueues) { + num_updates += queue.size(); + } + if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) { DPRINTF(PushEngine, "%s: Found an existing update " "for dst: %lu.\n", __func__, dst); @@ -385,7 +410,7 @@ PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) prev_src, dst, new_val); stats.updateQueueCoalescions++; return true; - } else if (destinationQueues[port_id].size() < updateQueueSize) { + } else if (num_updates < (updateQueueSize * destinationQueues.size())) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue for port %d.\n", __func__, port_id); destinationQueues[port_id].emplace_back(dst, curTick()); @@ -401,6 +426,8 @@ PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) } return true; } + DPRINTF(PushEngine, "%s: DestinationQueue for pot %d is blocked.\n", + __func__, port_id); return false; } @@ -468,6 +495,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) push(_push), ADD_STAT(numPropagates, statistics::units::Count::get(), "Number of propagate operations done."), + ADD_STAT(updateQueueFull, statistics::units::Count::get(), + "Number of times the update queue returns false."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), // ADD_STAT(numIdleCycles, statistics::units::Count::get(), @@ -508,7 +537,7 @@ PushEngine::PushStats::regStats() edgeQueueLatency.init(64); edgeQueueLength.init(64); 
updateQueueLength.init(64); - numPropagatesHist.init(push.params().max_propagates_per_cycle); + numPropagatesHist.init(1 + push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9f489455ac..08a5d278f5 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -172,8 +172,8 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numMemoryBlocks; statistics::Scalar numPropagates; + statistics::Scalar updateQueueFull; statistics::Scalar numNetBlocks; - // statistics::Scalar numIdleCycles; statistics::Scalar updateQueueCoalescions; statistics::Scalar numUpdates; statistics::Scalar numWastefulEdgesRead; From 7521e84614d2869916db0e188cd02e8298a791f5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Apr 2023 22:18:41 -0700 Subject: [PATCH 258/279] Initial commit for PG. --- configs/accl/bfs.py | 3 +- configs/accl/sega.py | 31 ++- src/accl/graph/base/data_structs.hh | 35 ++++ src/accl/graph/sega/CenteralController.py | 4 + src/accl/graph/sega/centeral_controller.cc | 225 ++++++++++++++++++++- src/accl/graph/sega/centeral_controller.hh | 106 +++++++++- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/enums.cc | 3 + src/accl/graph/sega/enums.hh | 10 + src/accl/graph/sega/wl_engine.cc | 5 + 10 files changed, 418 insertions(+), 6 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 97f1b5dc21..7035b2a535 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -111,7 +111,8 @@ def get_inputs(): m5.instantiate() - system.set_async_mode() + # system.set_async_mode() + system.set_pg_mode() system.create_pop_count_directory(64) if visited: system.create_bfs_visited_workload(init_addr, init_value) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 58a8caddde..ca1f4b9381 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -137,7 +137,13 @@ def setPort(self, port): class 
SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): + def __init__( + self, + num_gpts, + num_registers, + cache_size, + graph_path, + ): super(SEGA, self).__init__() # num_gpts should be an even power of 2 assert num_gpts != 0 @@ -151,8 +157,26 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.mem_mode = "timing" # Building the CenteralController + self.mirror_mem = SimpleMemory( + latency="90ns", + latency_var="0ns", + bandwidth="28GiB/s", + image_file=f"{graph_path}/mirrors", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.map_mem = SimpleMemory( + latency="90ns", + latency_var="0ns", + bandwidth="28GiB/s", + image_file=f"{graph_path}/mirrors_map", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) self.ctrl = CenteralController( - vertex_image_file=f"{graph_path}/vertices" + vertex_image_file=f"{graph_path}/vertices", + mirrors_mem=self.mirror_mem.port, + mirrors_map_mem=self.map_mem.port, ) # Building the EdgeMemories edge_mem = [] @@ -193,6 +217,9 @@ def set_async_mode(self): def set_bsp_mode(self): self.ctrl.setBSPMode() + def set_pg_mode(self): + self.ctrl.setPGMode() + def create_pop_count_directory(self, atoms_per_block): self.ctrl.createPopCountDirectory(atoms_per_block) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 60391b3a7c..f1a26f6ac2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -94,8 +94,43 @@ struct __attribute__ ((packed)) Edge {} }; +struct __attribute__ ((packed)) MirrorVertex +{ + uint32_t vertexId : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeNext: 1; + + std::string to_string() + { + return csprintf("MirrorVertex{vertexId: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeNext: %s}", + vertexId, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeNext ? 
"true" : "false"); + } + MirrorVertex(): + vertexId(-1), + prop(-1), + edgeIndex(-1), + degree(-1), + activeNow(false), + activeNext(false) + {} + + MirrorVertex(uint32_t vertex_id, uint32_t prop, uint32_t degree, + uint32_t edge_index, bool active_now, bool active_next): + vertexId(vertex_id), prop(prop), edgeIndex(edge_index), + degree(degree), activeNow(active_now), activeNext(active_next) + {} + +}; + static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); +static_assert(isPowerOf2(sizeof(MirrorVertex))); struct MetaEdge { uint64_t src; diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index c5f44c82e9..c5187ba6ec 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -35,6 +35,9 @@ class CenteralController(ClockedObject): cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' + mirrors_mem = RequestPort("Port to a memory storing vertex mirrors file.") + mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") + system = Param.System(Parent.any, "System this Engine is a part of") vertex_image_file = Param.String("Path to the vertex image file.") @@ -44,6 +47,7 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("setAsyncMode"), PyBindMethod("setBSPMode"), + PyBindMethod("setPGMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), PyBindMethod("createBFSVisitedWorkload"), diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0aee3b77ce..71b38edda8 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -43,7 +43,13 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), - mode(ProcessingMode::NOT_SET) + mirrorsPort("mirrors_mem", this, 0), 
mapPort("map_port", this, 1), + mode(ProcessingMode::NOT_SET), currentSliceNumber(0), totalSliceNumber(148), + lastReadPacketId(0), + nextMirrorMapReadEvent([this] { processNextMirrorMapReadEvent(); }, name()), + nextMirrorReadEvent([this] { processNextMirrorReadEvent(); }, name()), + nextMirrorUpdateEvent([this] { processNextMirrorUpdateEvent(); }, name()), + nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -51,6 +57,18 @@ CenteralController::CenteralController(const Params& params): } } +Port& +CenteralController::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "mirrors_mem") { + return mirrorsPort; + } else if (if_name == "mirrors_map_mem") { + return mapPort; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { @@ -108,6 +126,11 @@ CenteralController::createPopCountDirectory(int atoms_per_block) mpu->createBSPPopCountDirectory(atoms_per_block); } } + if (mode == ProcessingMode::POLY_GRAPH) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } } void @@ -152,6 +175,45 @@ CenteralController::startup() workload->iterate(); } +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + DPRINTF(CenteralController, "%s: Port %d: Packet %s is blocked.\n", __func__, _id, pkt->print()); + blockedPacket = pkt; + } else { + DPRINTF(CenteralController, "%s: Port %d: Packet %s sent.\n", __func__, _id, pkt->print()); + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + DPRINTF(CenteralController, "%s: Port %d received pkt: %s.\n", __func__, _id, pkt->print()); + return owner->handleMemResp(pkt, _id); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + DPRINTF(CenteralController, "%s: ReqPort %d received a reqRetry. " + "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + DPRINTF(CenteralController, "%s: blockedPacket sent successfully.\n", __func__); + owner->recvReqRetry(_id); + } +} + PacketPtr CenteralController::createReadPacket(Addr addr, unsigned int size) { @@ -190,8 +252,169 @@ CenteralController::recvDoneSignal() workload->iterate(); exitSimLoopNow("finished an iteration."); } + + if (done && mode == ProcessingMode::POLY_GRAPH) { + // assert(!nextMirrorMapReadEvent.scheduled()); + if (!nextMirrorMapReadEvent.scheduled()) { + schedule(nextMirrorMapReadEvent, nextCycle()); + } + } } +void +CenteralController::processNextMirrorMapReadEvent() +{ + // TODO: In future add functionality to align start_addr and end_addr to + // size of the vertex atom. 
+ Addr start_addr = currentSliceNumber * totalSliceNumber * sizeof(int); + Addr end_addr = start_addr + totalSliceNumber * sizeof(int); + PacketPtr start = createReadPacket(start_addr, sizeof(int)); + PointerTag* start_tag = new PointerTag(lastReadPacketId, PointerType::START); + start->pushSenderState(start_tag); + PacketPtr end = createReadPacket(end_addr, sizeof(int)); + PointerTag* end_tag = new PointerTag(lastReadPacketId, PointerType::END); + end->pushSenderState(end_tag); + lastReadPacketId++; + mapPort.sendPacket(start); + mapPort.sendPacket(end); +} + +bool +CenteralController::handleMemResp(PacketPtr pkt, PortID id) +{ + assert(pkt->isResponse()); + if (id == 0) { + if (pkt->isWrite()) { + delete pkt; + return true; + } + assert(reqInfoMap.find(pkt->req) != reqInfoMap.end()); + Addr offset; + int num_mirrors; + int pkt_size_in_mirrors = pkt->getSize() / sizeof(MirrorVertex); + MirrorVertex data[pkt_size_in_mirrors]; + pkt->writeDataToBlock((uint8_t*) data, pkt->getSize()); + + std::tie(offset, num_mirrors) = reqInfoMap[pkt->req]; + assert(num_mirrors > 0); + offset = (int) (offset / sizeof(MirrorVertex)); + for (int i = 0; i < num_mirrors; i++) { + mirrorQueue.push_back(data[i + offset]); + } + delete pkt; + + if (!nextMirrorUpdateEvent.scheduled()) { + schedule(nextMirrorUpdateEvent, nextCycle()); + } + return true; + } else if (id == 1) { + PointerTag* tag = pkt->findNextSenderState(); + int read_id = tag->Id(); + PointerType read_type = tag->type(); + if (read_type == PointerType::START) { + assert(startAddrs.find(read_id) == startAddrs.end()); + startAddrs[read_id] = pkt->getLE(); + if (endAddrs.find(read_id) != endAddrs.end()) { + int vertex_atom = mpuVector.front()->vertexAtomSize(); + mirrorPointerQueue.emplace_back( + startAddrs[read_id], endAddrs[read_id], + sizeof(MirrorVertex), vertex_atom); + if (!nextMirrorReadEvent.scheduled()) { + schedule(nextMirrorReadEvent, nextCycle()); + } + } + } else { + assert(read_type == PointerType::END); + 
assert(endAddrs.find(read_id) == endAddrs.end()); + endAddrs[read_id] = pkt->getLE(); + if (startAddrs.find(read_id) != startAddrs.end()) { + int vertex_atom = mpuVector.front()->vertexAtomSize(); + mirrorPointerQueue.emplace_back( + startAddrs[read_id], endAddrs[read_id], + sizeof(MirrorVertex), vertex_atom); + if (!nextMirrorReadEvent.scheduled()) { + schedule(nextMirrorReadEvent, nextCycle()); + } + } + } + DPRINTF(CenteralController, "%s: Received pkt: %s from port %d " + "with value: %d.\n", __func__, + pkt->print(), id, pkt->getLE()); + delete tag; + delete pkt; + return true; + } else { + panic("did not expect this."); + } +} + +void +CenteralController::recvReqRetry(PortID id) { + if (id == 0) { + assert(!nextMirrorReadEvent.scheduled()); + if (!mirrorPointerQueue.empty()) { + schedule(nextMirrorReadEvent, nextCycle()); + } + } else if (id == 1) { + DPRINTF(CenteralController, "%s: Ignoring reqRetry " + "for port %d.\n", __func__, id); + } else { + panic("Did not expect the other."); + } +} + +void +CenteralController::processNextMirrorReadEvent() +{ + Addr aligned_addr, offset; + int num_mirrors; + + int vertex_atom = mpuVector.front()->vertexAtomSize(); + MirrorReadInfoGen& front = mirrorPointerQueue.front(); + std::tie(aligned_addr, offset, num_mirrors) = front.nextReadPacketInfo(); + PacketPtr pkt = createReadPacket(aligned_addr, vertex_atom); + mirrorsPort.sendPacket(pkt); + reqInfoMap[pkt->req] = std::make_tuple(offset, num_mirrors); + front.iterate(); + if (front.done()) { + mirrorPointerQueue.pop_front(); + } + + if (!mirrorPointerQueue.empty() && !mirrorsPort.blocked()) { + schedule(nextMirrorReadEvent, nextCycle()); + } +} + +void +CenteralController::processNextMirrorUpdateEvent() +{ + int vertex_atom = mpuVector.front()->vertexAtomSize(); + MirrorVertex front = mirrorQueue.front(); + Addr org_addr = front.vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (aligned_org_addr - org_addr) / 
sizeof(WorkListItem); + int num_items = vertex_atom / sizeof(WorkListItem); + WorkListItem data[num_items]; + + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, org_addr)) { + mpu->recvFunctional(read_org); + } + } + read_org->writeDataToBlock((uint8_t*) data, vertex_atom); + DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, workload->printWorkListItem(data[wl_offset]), front.to_string()); + std::cout << workload->printWorkListItem(data[wl_offset]) << std::endl; + mirrorQueue.pop_front(); + if (!mirrorQueue.empty()) { + schedule(nextMirrorUpdateEvent, nextCycle()); + } +} + +void +CenteralController::processNextWriteBackEvent() {} + int CenteralController::workCount() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index cce9ac2725..6f69b0aa81 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -36,6 +36,7 @@ #include "accl/graph/sega/enums.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" +#include "base/intmath.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -46,7 +47,79 @@ namespace gem5 class CenteralController : public ClockedObject { private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, CenteralController* owner, PortID id): + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + struct PointerTag : public Packet::SenderState + { + int _id; + PointerType _type; + PointerTag(int id, PointerType type): 
_id(id), _type(type) {} + int Id() { return _id; } + PointerType type() { return _type; } + + }; + + class MirrorReadInfoGen { + private: + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + + public: + MirrorReadInfoGen(Addr start, Addr end, size_t step, size_t atom): + _start(start), _end(end), _step(step), _atom(atom) + {} + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (aligned_addr + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + + return std::make_tuple(aligned_addr, offset, num_items); + } + + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + + bool done() { return (_start >= _end); } + }; + System* system; + + ReqPort mirrorsPort; + ReqPort mapPort; + Addr maxVertexAddr; ProcessingMode mode; @@ -54,18 +127,49 @@ class CenteralController : public ClockedObject std::vector mpuVector; std::unordered_map addrRangeListMap; + // FIXME: Initialize these two. 
+ int currentSliceNumber; + int totalSliceNumber; + int lastReadPacketId; + std::unordered_map startAddrs; + std::unordered_map endAddrs; + // TODO: Set a max size for this queue; + std::deque mirrorPointerQueue; + std::unordered_map> reqInfoMap; + + std::deque mirrorQueue; + std::deque writeBackQueue; + PacketPtr createReadPacket(Addr addr, unsigned int size); - public: + bool handleMemResp(PacketPtr pkt, PortID id); + void recvReqRetry(PortID id); + EventFunctionWrapper nextMirrorMapReadEvent; + void processNextMirrorMapReadEvent(); + + EventFunctionWrapper nextMirrorReadEvent; + void processNextMirrorReadEvent(); + + EventFunctionWrapper nextMirrorUpdateEvent; + void processNextMirrorUpdateEvent(); + + EventFunctionWrapper nextWriteBackEvent; + void processNextWriteBackEvent(); + + public: GraphWorkload* workload; PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void startup() override; void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + void setPGMode() { mode = ProcessingMode::POLY_GRAPH; } void createPopCountDirectory(int atoms_per_block); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a2653952e0..083e8d4c37 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -680,7 +680,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; - if (mode == ProcessingMode::ASYNCHRONOUS) { + if (mode == ProcessingMode::ASYNCHRONOUS || mode == ProcessingMode::POLY_GRAPH) { cacheBlocks[block_index].items[wl_offset].activeNow |= active; if (active && (!numActiveBlocksNow.find(block_index))) { numActiveBlocksNow.push_back(block_index); diff 
--git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 2f1bc983eb..5b8de3404f 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -62,7 +62,10 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = { "NOT_SET", "ASYNCHRONOUS", + "POLY_GRAPH", "BULK_SYNCHRONOUS" }; +const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"N/A", "START", "END"}; + } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 4e7d64235e..92e293bec0 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -74,10 +74,20 @@ enum ProcessingMode NOT_SET, ASYNCHRONOUS, BULK_SYNCHRONOUS, + POLY_GRAPH, NUM_PROCESSING_MODE }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; +enum PointerType +{ + NA, + START, + END, + NUM_POINTER_TYPE +}; +extern const char* pointerTypeStrings[NUM_POINTER_TYPE]; + } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 5a4a960635..c294441703 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -384,8 +384,13 @@ WLEngine::processNextReduceEvent() Addr addr = toReduce.front(); assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_REDUCE); uint32_t update_value = std::get<1>(registerFile[addr]); + DPRINTF(WLEngine, "%s: Reducing for addr: %lu, update_value: %u, " + "temp_prop: %s.\n", __func__, addr, + update_value, workListFile[addr].tempProp); workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction result: %s", __func__, + graphWorkload->printWorkListItem(workListFile[addr])); registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); num_reduces++; stats.numReductions++; From 5e3a809b122e58c8c4abb2dc76f74cfc889cfd29 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 5 Apr 2023 05:30:11 -0700 Subject: 
[PATCH 259/279] Fixing typo in centeral controller. --- src/accl/graph/sega/centeral_controller.cc | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 71b38edda8..64704510d6 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -183,17 +183,18 @@ CenteralController::ReqPort::sendPacket(PacketPtr pkt) // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { - DPRINTF(CenteralController, "%s: Port %d: Packet %s is blocked.\n", __func__, _id, pkt->print()); + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "is blocked.\n", __func__, _id, pkt->print()); blockedPacket = pkt; } else { - DPRINTF(CenteralController, "%s: Port %d: Packet %s sent.\n", __func__, _id, pkt->print()); + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "sent.\n", __func__, _id, pkt->print()); } } bool CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) { - DPRINTF(CenteralController, "%s: Port %d received pkt: %s.\n", __func__, _id, pkt->print()); return owner->handleMemResp(pkt, _id); } @@ -209,7 +210,8 @@ CenteralController::ReqPort::recvReqRetry() blockedPacket = nullptr; sendPacket(pkt); if (blockedPacket == nullptr) { - DPRINTF(CenteralController, "%s: blockedPacket sent successfully.\n", __func__); + DPRINTF(CenteralController, "%s: blockedPacket sent " + "successfully.\n", __func__); owner->recvReqRetry(_id); } } @@ -392,7 +394,7 @@ CenteralController::processNextMirrorUpdateEvent() MirrorVertex front = mirrorQueue.front(); Addr org_addr = front.vertexId * sizeof(WorkListItem); Addr aligned_org_addr = roundDown(org_addr, vertex_atom); - int wl_offset = (aligned_org_addr - org_addr) / sizeof(WorkListItem); + int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); int num_items = vertex_atom / sizeof(WorkListItem); WorkListItem 
data[num_items]; @@ -404,9 +406,10 @@ CenteralController::processNextMirrorUpdateEvent() } } read_org->writeDataToBlock((uint8_t*) data, vertex_atom); - DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, workload->printWorkListItem(data[wl_offset]), front.to_string()); - std::cout << workload->printWorkListItem(data[wl_offset]) << std::endl; + DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, + workload->printWorkListItem(data[wl_offset]), front.to_string()); mirrorQueue.pop_front(); + delete read_org; if (!mirrorQueue.empty()) { schedule(nextMirrorUpdateEvent, nextCycle()); } From c16ff96425e3d8dc2282d46bf9894908bd129e77 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Apr 2023 19:54:16 -0700 Subject: [PATCH 260/279] Updating centeral controller. --- src/accl/graph/sega/centeral_controller.cc | 79 +++++++++++++--------- src/accl/graph/sega/centeral_controller.hh | 6 +- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 64704510d6..32eae5ce9f 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -290,21 +290,8 @@ CenteralController::handleMemResp(PacketPtr pkt, PortID id) delete pkt; return true; } - assert(reqInfoMap.find(pkt->req) != reqInfoMap.end()); - Addr offset; - int num_mirrors; - int pkt_size_in_mirrors = pkt->getSize() / sizeof(MirrorVertex); - MirrorVertex data[pkt_size_in_mirrors]; - pkt->writeDataToBlock((uint8_t*) data, pkt->getSize()); - - std::tie(offset, num_mirrors) = reqInfoMap[pkt->req]; - assert(num_mirrors > 0); - offset = (int) (offset / sizeof(MirrorVertex)); - for (int i = 0; i < num_mirrors; i++) { - mirrorQueue.push_back(data[i + offset]); - } + readQueue.push_back(pkt); delete pkt; - if (!nextMirrorUpdateEvent.scheduled()) { schedule(nextMirrorUpdateEvent, nextCycle()); } @@ -376,7 +363,6 @@ CenteralController::processNextMirrorReadEvent() 
std::tie(aligned_addr, offset, num_mirrors) = front.nextReadPacketInfo(); PacketPtr pkt = createReadPacket(aligned_addr, vertex_atom); mirrorsPort.sendPacket(pkt); - reqInfoMap[pkt->req] = std::make_tuple(offset, num_mirrors); front.iterate(); if (front.done()) { mirrorPointerQueue.pop_front(); @@ -391,32 +377,59 @@ void CenteralController::processNextMirrorUpdateEvent() { int vertex_atom = mpuVector.front()->vertexAtomSize(); - MirrorVertex front = mirrorQueue.front(); - Addr org_addr = front.vertexId * sizeof(WorkListItem); - Addr aligned_org_addr = roundDown(org_addr, vertex_atom); - int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); - int num_items = vertex_atom / sizeof(WorkListItem); - WorkListItem data[num_items]; - PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, org_addr)) { - mpu->recvFunctional(read_org); + int num_mirrors_per_atom = vertex_atom / sizeof(MirrorVertex); + int num_vertices_per_atom = vertex_atom / sizeof(WorkListItem); + MirrorVertex mirrors[num_mirrors_per_atom]; + WorkListItem vertices[num_vertices_per_atom]; + + PacketPtr front = readQueue.front(); + front->writeDataToBlock((uint8_t*) mirrors, vertex_atom); + for (int i = 0; i < num_mirrors_per_atom; i++) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); + + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, org_addr)) { + mpu->recvFunctional(read_org); + } + } + read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); + DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, + workload->printWorkListItem(vertices[wl_offset]), front.to_string()); + delete 
read_org; + + if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { + assert(data[wl_offset].degree == 0); + vertices[wl_offset].prop = vertices[wl_offset].tempProp; } + if (mirrors[i].prop != vertices[wl_offset].prop) { + mirrors[i].prop = vertices[wl_offset].prop; + mirrors[i].activeNow = true; + } + } + + PacketPtr wb = createWritePacket( + front->getAddr(), front->getSize(), (uint8_t*) mirrors); + readQueue.pop_front(); + delete front; + + if (!nextWriteBackEvent.scheduled()) { + schedule(nextWriteBackEvent, nextCycle()); } - read_org->writeDataToBlock((uint8_t*) data, vertex_atom); - DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, - workload->printWorkListItem(data[wl_offset]), front.to_string()); - mirrorQueue.pop_front(); - delete read_org; - if (!mirrorQueue.empty()) { + if (!readQueue.empty()) { schedule(nextMirrorUpdateEvent, nextCycle()); } } void -CenteralController::processNextWriteBackEvent() {} +CenteralController::processNextWriteBackEvent() +{ + PacketPtr front = writeBackQueue.front(); +} int CenteralController::workCount() diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 6f69b0aa81..5b37b37e06 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -130,17 +130,19 @@ class CenteralController : public ClockedObject // FIXME: Initialize these two. 
int currentSliceNumber; int totalSliceNumber; + int lastReadPacketId; std::unordered_map startAddrs; std::unordered_map endAddrs; // TODO: Set a max size for this queue; std::deque mirrorPointerQueue; - std::unordered_map> reqInfoMap; - std::deque mirrorQueue; + std::deque readQueue; std::deque writeBackQueue; + int getSliceNumber(Addr vertex_addr); PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); bool handleMemResp(PacketPtr pkt, PortID id); void recvReqRetry(PortID id); From 8fa233d8d19dfa70a4e39221481ce5dc224fde8a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 11 Apr 2023 23:29:39 -0700 Subject: [PATCH 261/279] Completing temporal partitioning. --- configs/accl/bfs.py | 22 +- configs/accl/sega.py | 11 +- src/accl/graph/base/graph_workload.hh | 2 + src/accl/graph/sega/CenteralController.py | 9 +- src/accl/graph/sega/centeral_controller.cc | 373 ++++++++++----------- src/accl/graph/sega/centeral_controller.hh | 107 ++---- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 6 +- src/accl/graph/sega/enums.cc | 2 +- src/accl/graph/sega/enums.hh | 9 +- src/accl/graph/sega/mpu.cc | 7 + src/accl/graph/sega/mpu.hh | 5 + src/accl/graph/sega/push_engine.cc | 24 ++ src/accl/graph/sega/push_engine.hh | 3 + src/mem/simple_mem.hh | 3 +- 15 files changed, 274 insertions(+), 311 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 7035b2a535..1fec26a321 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -120,20 +120,24 @@ def get_inputs(): system.create_bfs_workload(init_addr, init_value) if sample: while True: - exit_event = m5.simulate(100000000) + exit_event = m5.simulate(50000000) print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" ) - m5.stats.dump() - m5.stats.reset() - if exit_event.getCause() != "simulate() limit reached": + if exit_event.getCause() == "simulate() limit 
reached": + m5.stats.dump() + m5.stats.reset() + elif exit_event.getCause() == "Done with all the slices.": break else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "Done with all the slices.": + break if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ca1f4b9381..982235697a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,6 +64,7 @@ def __init__(self, register_file_size: int, cache_size: str): pending_pull_limit=64, active_buffer_size=80, post_push_wb_queue_size=64, + transitions_per_cycle=4, ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -145,7 +146,6 @@ def __init__( graph_path, ): super(SEGA, self).__init__() - # num_gpts should be an even power of 2 assert num_gpts != 0 assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 @@ -160,23 +160,24 @@ def __init__( self.mirror_mem = SimpleMemory( latency="90ns", latency_var="0ns", - bandwidth="28GiB/s", + bandwidth="256GiB/s", image_file=f"{graph_path}/mirrors", range=AddrRange(start=0, size="4GiB"), in_addr_map=False, ) self.map_mem = SimpleMemory( - latency="90ns", + latency="0ns", latency_var="0ns", - bandwidth="28GiB/s", + bandwidth="1024GiB/s", image_file=f"{graph_path}/mirrors_map", range=AddrRange(start=0, size="4GiB"), in_addr_map=False, ) self.ctrl = CenteralController( vertex_image_file=f"{graph_path}/vertices", - mirrors_mem=self.mirror_mem.port, + mem_port=self.mirror_mem.port, mirrors_map_mem=self.map_mem.port, + mirrors_mem=self.mirror_mem ) # Building the EdgeMemories edge_mem = [] diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 72748502c1..481cfc146f 100644 --- a/src/accl/graph/base/graph_workload.hh +++ 
b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool betterThan(uint32_t lhs, uint32_t rhs) { return true; } virtual void iterate() = 0; virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; @@ -74,6 +75,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual bool betterThan(uint32_t lhs, uint32_t rhs) override { return lhs < rhs; } virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index c5187ba6ec..9bcb237a9b 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -28,20 +28,19 @@ from m5.params import * from m5.proxy import * from m5.util.pybind import PyBindMethod -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class CenteralController(ClockedObject): +class CenteralController(BaseMemoryEngine): type = 'CenteralController' cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' - mirrors_mem = RequestPort("Port to a memory storing vertex mirrors file.") mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") - system = Param.System(Parent.any, "System this Engine is a part of") - vertex_image_file = Param.String("Path to the vertex image file.") + mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.") + mpu_vector = VectorParam.MPU("All 
mpus in the system.") cxx_exports = [ diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 32eae5ce9f..1f3b7f5ac1 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/centeral_controller.hh" +#include #include #include "base/cprintf.hh" @@ -41,29 +42,27 @@ namespace gem5 { CenteralController::CenteralController(const Params& params): - ClockedObject(params), - system(params.system), - mirrorsPort("mirrors_mem", this, 0), mapPort("map_port", this, 1), - mode(ProcessingMode::NOT_SET), currentSliceNumber(0), totalSliceNumber(148), - lastReadPacketId(0), - nextMirrorMapReadEvent([this] { processNextMirrorMapReadEvent(); }, name()), - nextMirrorReadEvent([this] { processNextMirrorReadEvent(); }, name()), - nextMirrorUpdateEvent([this] { processNextMirrorUpdateEvent(); }, name()), - nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()) + BaseMemoryEngine(params), + mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), + mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), + nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()) { + uint64_t total_cache_size = 0; for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); mpu->registerCenteralController(this); + total_cache_size += mpu->getCacheSize(); } + verticesPerSlice = std::floor(total_cache_size / sizeof(WorkListItem)); } Port& CenteralController::getPort(const std::string& if_name, PortID idx) { - if (if_name == "mirrors_mem") { - return mirrorsPort; - } else if (if_name == "mirrors_map_mem") { + if (if_name == "mirrors_map_mem") { return mapPort; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort("mem_port", idx); } else { return ClockedObject::getPort(if_name, idx); } @@ -138,7 +137,9 @@ CenteralController::startup() { unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); 
for (auto mpu: mpuVector) { - addrRangeListMap[mpu] = mpu->getAddrRanges(); + for (auto range: mpu->getAddrRanges()) { + mpuAddrMap.insert(range, mpu); + } mpu->setProcessingMode(mode); mpu->recvWorkload(workload); } @@ -154,14 +155,20 @@ CenteralController::startup() loader::MemoryImage vertex_image = object->buildImage(); maxVertexAddr = vertex_image.maxAddr(); + int num_total_vertices = (maxVertexAddr / sizeof(WorkListItem)); + numTotalSlices = std::ceil((double) num_total_vertices / verticesPerSlice); + + numPendingUpdates = new int [numTotalSlices]; + bestPendingUpdate = new uint32_t [numTotalSlices]; + for (int i = 0; i < numTotalSlices; i++) { + numPendingUpdates[i] = 0; + bestPendingUpdate[i] = -1; + } + PortProxy vertex_proxy( [this](PacketPtr pkt) { - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, pkt->getAddr())) { - mpu->recvFunctional(pkt); - } - } + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); }, vertex_atom); panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); @@ -195,40 +202,13 @@ CenteralController::ReqPort::sendPacket(PacketPtr pkt) bool CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) { - return owner->handleMemResp(pkt, _id); + panic("recvTimingResp should not be called at all"); } void CenteralController::ReqPort::recvReqRetry() { - panic_if(blockedPacket == nullptr, - "Received retry without a blockedPacket."); - - DPRINTF(CenteralController, "%s: ReqPort %d received a reqRetry. 
" - "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); - if (blockedPacket == nullptr) { - DPRINTF(CenteralController, "%s: blockedPacket sent " - "successfully.\n", __func__); - owner->recvReqRetry(_id); - } -} - -PacketPtr -CenteralController::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) 0) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; + panic("recvReqRetry should not be called at all"); } void @@ -256,179 +236,175 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::POLY_GRAPH) { - // assert(!nextMirrorMapReadEvent.scheduled()); - if (!nextMirrorMapReadEvent.scheduled()) { - schedule(nextMirrorMapReadEvent, nextCycle()); + DPRINTF(CenteralController, "%s: Received done signal.\n", __func__); + exitSimLoopNow("Finished processing a slice."); + if (!nextSliceSwitchEvent.scheduled()) { + schedule(nextSliceSwitchEvent, nextCycle()); } } } -void -CenteralController::processNextMirrorMapReadEvent() +int +CenteralController::chooseNextSlice() { - // TODO: In future add functionality to align start_addr and end_addr to - // size of the vertex atom. 
- Addr start_addr = currentSliceNumber * totalSliceNumber * sizeof(int); - Addr end_addr = start_addr + totalSliceNumber * sizeof(int); - PacketPtr start = createReadPacket(start_addr, sizeof(int)); - PointerTag* start_tag = new PointerTag(lastReadPacketId, PointerType::START); - start->pushSenderState(start_tag); - PacketPtr end = createReadPacket(end_addr, sizeof(int)); - PointerTag* end_tag = new PointerTag(lastReadPacketId, PointerType::END); - end->pushSenderState(end_tag); - lastReadPacketId++; - mapPort.sendPacket(start); - mapPort.sendPacket(end); + int ret_slice_id = -1; + int max_pending_count = 0; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_pending_count) { + max_pending_count = numPendingUpdates[i]; + ret_slice_id = i; + } + } + return ret_slice_id; } -bool -CenteralController::handleMemResp(PacketPtr pkt, PortID id) +void +CenteralController::processNextSliceSwitchEvent() { - assert(pkt->isResponse()); - if (id == 0) { - if (pkt->isWrite()) { - delete pkt; - return true; - } - readQueue.push_back(pkt); - delete pkt; - if (!nextMirrorUpdateEvent.scheduled()) { - schedule(nextMirrorUpdateEvent, nextCycle()); - } - return true; - } else if (id == 1) { - PointerTag* tag = pkt->findNextSenderState(); - int read_id = tag->Id(); - PointerType read_type = tag->type(); - if (read_type == PointerType::START) { - assert(startAddrs.find(read_id) == startAddrs.end()); - startAddrs[read_id] = pkt->getLE(); - if (endAddrs.find(read_id) != endAddrs.end()) { - int vertex_atom = mpuVector.front()->vertexAtomSize(); - mirrorPointerQueue.emplace_back( - startAddrs[read_id], endAddrs[read_id], - sizeof(MirrorVertex), vertex_atom); - if (!nextMirrorReadEvent.scheduled()) { - schedule(nextMirrorReadEvent, nextCycle()); - } + int vertex_atom = mpuVector.front()->vertexAtomSize(); + int vertices_per_atom = (int) vertex_atom / sizeof(WorkListItem); + int bytes_accessed = 0; + int updates_generated_total = 0; + for (int dst_id = 0; dst_id < 
numTotalSlices; dst_id++) { + int updates_generated = 0; + Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(int); + Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(int); + PacketPtr start = createReadPacket(start_pointer, sizeof(int)); + PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, + currentSliceId, dst_id, start_addr, end_addr); + + int num_bytes = end_addr - start_addr; + int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex mirrors [num_mirrors]; + + PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + + WorkListItem vertices [vertices_per_atom]; + for (int i = 0; i < num_mirrors; i++) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (int) (org_addr - aligned_org_addr) / sizeof(WorkListItem); + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + auto routing_entry = mpuAddrMap.contains(aligned_org_addr); + routing_entry->second->recvFunctional(read_org); + read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); + delete read_org; + if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { + assert(vertices[wl_offset].degree == 0); + vertices[wl_offset].prop = vertices[wl_offset].tempProp; } - } else { - assert(read_type == PointerType::END); - assert(endAddrs.find(read_id) == endAddrs.end()); - endAddrs[read_id] = pkt->getLE(); - if (startAddrs.find(read_id) != startAddrs.end()) { - int vertex_atom = mpuVector.front()->vertexAtomSize(); - mirrorPointerQueue.emplace_back( - startAddrs[read_id], 
endAddrs[read_id], - sizeof(MirrorVertex), vertex_atom); - if (!nextMirrorReadEvent.scheduled()) { - schedule(nextMirrorReadEvent, nextCycle()); + if (mirrors[i].prop != vertices[wl_offset].prop) { + mirrors[i].prop = vertices[wl_offset].prop; + if (!mirrors[i].activeNow) { + mirrors[i].activeNow = true; + numPendingUpdates[dst_id]++; + totalUpdatesLeft++; + updates_generated++; } + bestPendingUpdate[dst_id] = + workload->reduce(bestPendingUpdate[dst_id], mirrors[i].prop); } } - DPRINTF(CenteralController, "%s: Received pkt: %s from port %d " - "with value: %d.\n", __func__, - pkt->print(), id, pkt->getLE()); - delete tag; - delete pkt; - return true; + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + DPRINTF(CenteralController, "%s: Done scattering updates from slice " + "%d to slice %d.\n", __func__, currentSliceId, dst_id); + DPRINTF(CenteralController, "%s: Generated %d updates from slice " + "%d to slice %d.\n", __func__, + updates_generated, currentSliceId, dst_id); + updates_generated_total += updates_generated; + bytes_accessed += 2 * num_bytes; + } + DPRINTF(CenteralController, "%s: Done with slice %d.\n", __func__, currentSliceId); + DPRINTF(CenteralController, "%s: Generated a total of %d updates.\n", + __func__, updates_generated_total); + DPRINTF(CenteralController, "%s: There are a total of %d " + "updates left.\n", __func__, totalUpdatesLeft); + if (totalUpdatesLeft > 0) { + currentSliceId = chooseNextSlice(); } else { - panic("did not expect this."); + exitSimLoopNow("Done with all the slices."); + return; } -} - -void -CenteralController::recvReqRetry(PortID id) { - if (id == 0) { - assert(!nextMirrorReadEvent.scheduled()); - if (!mirrorPointerQueue.empty()) { - schedule(nextMirrorReadEvent, nextCycle()); + DPRINTF(CenteralController, "%s: Chose %d as the " + "next slice.\n", __func__, currentSliceId); + + for (int src_id = 0; src_id < 
numTotalSlices; src_id++) { + Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(int); + Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(int); + PacketPtr start = createReadPacket(start_pointer, sizeof(int)); + PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + + int num_bytes = end_addr - start_addr; + int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex mirrors [num_mirrors]; + + PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + for (int i = 0; i < num_mirrors; i++) { + if (mirrors[i].activeNow) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + auto routing_entry = mpuAddrMap.contains(org_addr); + routing_entry->second->recvMirrorPush(org_addr, mirrors[i].prop, + mirrors[i].edgeIndex, mirrors[i].degree); + mirrors[i].activeNow = false; + numPendingUpdates[currentSliceId]--; + totalUpdatesLeft--; + } } - } else if (id == 1) { - DPRINTF(CenteralController, "%s: Ignoring reqRetry " - "for port %d.\n", __func__, id); - } else { - panic("Did not expect the other."); + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + DPRINTF(CenteralController, "%s: Done gathering updates from slice " + "%d to slice %d.\n", __func__, src_id, currentSliceId); + bytes_accessed += num_bytes; } -} - -void -CenteralController::processNextMirrorReadEvent() -{ - Addr aligned_addr, offset; - int num_mirrors; - int vertex_atom = mpuVector.front()->vertexAtomSize(); - MirrorReadInfoGen& front = mirrorPointerQueue.front(); - std::tie(aligned_addr, offset, num_mirrors) = front.nextReadPacketInfo(); 
- PacketPtr pkt = createReadPacket(aligned_addr, vertex_atom); - mirrorsPort.sendPacket(pkt); - front.iterate(); - if (front.done()) { - mirrorPointerQueue.pop_front(); + double mirror_mem_bw = mirrorsMem->getBW(); + Tick time_to_switch = bytes_accessed * mirror_mem_bw; + for (auto mpu: mpuVector) { + mpu->startProcessingMirrors(time_to_switch); } + exitSimLoopNow("Done with slice switch."); +} - if (!mirrorPointerQueue.empty() && !mirrorsPort.blocked()) { - schedule(nextMirrorReadEvent, nextCycle()); - } +bool +CenteralController::handleMemResp(PacketPtr pkt) +{ + panic("handleMemResp should not be called at all"); } void -CenteralController::processNextMirrorUpdateEvent() +CenteralController::recvMemRetry() { - int vertex_atom = mpuVector.front()->vertexAtomSize(); - - int num_mirrors_per_atom = vertex_atom / sizeof(MirrorVertex); - int num_vertices_per_atom = vertex_atom / sizeof(WorkListItem); - MirrorVertex mirrors[num_mirrors_per_atom]; - WorkListItem vertices[num_vertices_per_atom]; - - PacketPtr front = readQueue.front(); - front->writeDataToBlock((uint8_t*) mirrors, vertex_atom); - for (int i = 0; i < num_mirrors_per_atom; i++) { - Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); - Addr aligned_org_addr = roundDown(org_addr, vertex_atom); - int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); - - PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, org_addr)) { - mpu->recvFunctional(read_org); - } - } - read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); - DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, - workload->printWorkListItem(vertices[wl_offset]), front.to_string()); - delete read_org; - - if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { - assert(data[wl_offset].degree == 0); - vertices[wl_offset].prop = vertices[wl_offset].tempProp; - } - if (mirrors[i].prop 
!= vertices[wl_offset].prop) { - mirrors[i].prop = vertices[wl_offset].prop; - mirrors[i].activeNow = true; - } - } - - PacketPtr wb = createWritePacket( - front->getAddr(), front->getSize(), (uint8_t*) mirrors); - readQueue.pop_front(); - delete front; - - if (!nextWriteBackEvent.scheduled()) { - schedule(nextWriteBackEvent, nextCycle()); - } - if (!readQueue.empty()) { - schedule(nextMirrorUpdateEvent, nextCycle()); - } + panic("recvMemRetry should not be called at all"); } void -CenteralController::processNextWriteBackEvent() +CenteralController::recvFunctional(PacketPtr pkt) { - PacketPtr front = writeBackQueue.front(); + panic("recvFunctional should not be called at all"); } int @@ -457,12 +433,8 @@ CenteralController::printAnswerToHostSimout() for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) { PacketPtr pkt = createReadPacket(addr, vertex_atom); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, addr)) { - mpu->recvFunctional(pkt); - } - } + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); pkt->writeDataToBlock((uint8_t*) items, vertex_atom); for (int i = 0; i < num_items; i++) { std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, @@ -470,6 +442,7 @@ CenteralController::printAnswerToHostSimout() std::cout << print << std::endl; } + delete pkt; } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 5b37b37e06..67c4a9593e 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -29,22 +29,23 @@ #ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ #define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#include #include #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/sega/enums.hh" #include 
"accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "base/intmath.hh" +#include "mem/simple_mem.hh" #include "params/CenteralController.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" namespace gem5 { -class CenteralController : public ClockedObject +class CenteralController : public BaseMemoryEngine { private: class ReqPort : public RequestPort @@ -67,108 +68,44 @@ class CenteralController : public ClockedObject virtual void recvReqRetry(); }; - struct PointerTag : public Packet::SenderState - { - int _id; - PointerType _type; - PointerTag(int id, PointerType type): _id(id), _type(type) {} - int Id() { return _id; } - PointerType type() { return _type; } - - }; - - class MirrorReadInfoGen { - private: - Addr _start; - Addr _end; - size_t _step; - size_t _atom; - - public: - MirrorReadInfoGen(Addr start, Addr end, size_t step, size_t atom): - _start(start), _end(end), _step(step), _atom(atom) - {} - - std::tuple nextReadPacketInfo() - { - panic_if(done(), "Should not call nextPacketInfo when done.\n"); - Addr aligned_addr = roundDown(_start, _atom); - Addr offset = _start - aligned_addr; - int num_items = 0; - - if (_end > (aligned_addr + _atom)) { - num_items = (_atom - offset) / _step; - } else { - num_items = (_end - _start) / _step; - } - - return std::make_tuple(aligned_addr, offset, num_items); - } - - void iterate() - { - panic_if(done(), "Should not call iterate when done.\n"); - Addr aligned_addr = roundDown(_start, _atom); - _start = aligned_addr + _atom; - } - - bool done() { return (_start >= _end); } - }; - - System* system; - - ReqPort mirrorsPort; ReqPort mapPort; - Addr maxVertexAddr; - ProcessingMode mode; - std::vector mpuVector; - std::unordered_map addrRangeListMap; - - // FIXME: Initialize these two. 
- int currentSliceNumber; - int totalSliceNumber; - - int lastReadPacketId; - std::unordered_map startAddrs; - std::unordered_map endAddrs; - // TODO: Set a max size for this queue; - std::deque mirrorPointerQueue; - - std::deque readQueue; - std::deque writeBackQueue; + memory::SimpleMemory* mirrorsMem; - int getSliceNumber(Addr vertex_addr); - PacketPtr createReadPacket(Addr addr, unsigned int size); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + std::vector mpuVector; + AddrRangeMap mpuAddrMap; - bool handleMemResp(PacketPtr pkt, PortID id); - void recvReqRetry(PortID id); + int currentSliceId; + int numTotalSlices; + int verticesPerSlice; + int totalUpdatesLeft; - EventFunctionWrapper nextMirrorMapReadEvent; - void processNextMirrorMapReadEvent(); + int* numPendingUpdates; + uint32_t* bestPendingUpdate; - EventFunctionWrapper nextMirrorReadEvent; - void processNextMirrorReadEvent(); + int chooseNextSlice(); - EventFunctionWrapper nextMirrorUpdateEvent; - void processNextMirrorUpdateEvent(); + EventFunctionWrapper nextSliceSwitchEvent; + void processNextSliceSwitchEvent(); - EventFunctionWrapper nextWriteBackEvent; - void processNextWriteBackEvent(); + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; public: GraphWorkload* workload; PARAMS(CenteralController); - CenteralController(const CenteralControllerParams ¶ms); + CenteralController(const Params& params); Port& getPort(const std::string& if_name, PortID idx = InvalidPortID) override; virtual void startup() override; + virtual void recvFunctional(PacketPtr pkt) override; + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } void setPGMode() { mode = ProcessingMode::POLY_GRAPH; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 083e8d4c37..9938034a88 100644 --- a/src/accl/graph/sega/coalesce_engine.cc 
+++ b/src/accl/graph/sega/coalesce_engine.cc @@ -759,7 +759,7 @@ CoalesceEngine::processNextMemoryEvent() { int num_transitions = 0; std::unordered_set transitions; - FunctionDeque temp_deque; + MemoryFunctionDeque temp_deque; temp_deque.clear(); while (true) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4066c7dbe5..9de401cf81 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,7 +43,7 @@ namespace gem5 { -typedef std::deque, int, Tick>> FunctionDeque; +typedef std::deque, int, Tick>> MemoryFunctionDeque; class MPU; @@ -144,7 +144,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int transitionsPerCycle; - FunctionDeque memAccBuffer; + MemoryFunctionDeque memAccBuffer; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -217,7 +217,7 @@ class CoalesceEngine : public BaseMemoryEngine void createBSPPopCountDirectory(int atoms_per_block); void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt) override; void postMemInitSetup(); void postConsumeProcess(); void swapDirectories(); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 5b8de3404f..c85c60fd8d 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -66,6 +66,6 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = "BULK_SYNCHRONOUS" }; -const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"N/A", "START", "END"}; +const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"START", "END"}; } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 92e293bec0..194fdc2140 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -79,9 +79,16 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum PointerType +enum 
PGMode { NA, + SCATTER, + GATHER, + NUM_PG_MODE +}; + +enum PointerType +{ START, END, NUM_POINTER_TYPE diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f661bd68a6..a5063cf685 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -87,6 +87,13 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, pushEngine->recvVertexPush(addr, delta, edge_index, degree); } +void +MPU::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvMirrorPush(addr, delta, edge_index, degree); +} + void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 95d3adeca5..4afb2081ca 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -69,6 +69,7 @@ class MPU : public SimObject unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + uint64_t getCacheSize() { return coalesceEngine->params().cache_size; } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } @@ -88,6 +89,10 @@ class MPU : public SimObject void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait) { pushEngine->startProcessingMirrors(time_to_wait); } + void recvDoneSignal(); bool done(); }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 3279fb9450..2584cf02af 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -218,6 +218,30 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, } } +void +PushEngine::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, 
uint32_t degree) +{ + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); +} + +void +PushEngine::startProcessingMirrors(Tick time_to_wait) +{ + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + Cycles wait = ticksToCycles(time_to_wait); + if (!edgePointerQueue.empty()) { + schedule(nextMemoryReadEvent, clockEdge(wait)); + } +} + void PushEngine::processNextMemoryReadEvent() { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 08a5d278f5..2aced4b156 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -209,6 +209,9 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait); void recvReqRetry(); diff --git a/src/mem/simple_mem.hh b/src/mem/simple_mem.hh index 75a03fbe0e..0be85e9d86 100644 --- a/src/mem/simple_mem.hh +++ b/src/mem/simple_mem.hh @@ -180,7 +180,6 @@ class SimpleMemory : public AbstractMemory std::unique_ptr pendingDelete; public: - SimpleMemory(const SimpleMemoryParams &p); DrainState drain() override; @@ -189,6 +188,8 @@ class SimpleMemory : public AbstractMemory PortID idx=InvalidPortID) override; void init() override; + double getBW() { return bandwidth; } + protected: Tick recvAtomic(PacketPtr pkt); Tick recvAtomicBackdoor(PacketPtr pkt, MemBackdoorPtr &_backdoor); From d87e030d18b434ef3407d619873462c76bf9d670 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Apr 2023 01:56:31 -0700 Subject: [PATCH 262/279] Cleaning up 
and merging temp partition. --- configs/accl/bfs.py | 1 + configs/accl/sega.py | 107 ++++++++++++-------- configs/accl/sega_simple.py | 109 ++++++++++++++++----- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/PushEngine.py | 3 + src/accl/graph/sega/WLEngine.py | 6 +- src/accl/graph/sega/centeral_controller.cc | 20 +++- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/enums.cc | 10 -- src/accl/graph/sega/enums.hh | 16 --- src/accl/graph/sega/push_engine.cc | 3 +- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 4 +- src/accl/graph/sega/wl_engine.hh | 3 +- 14 files changed, 182 insertions(+), 106 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 1fec26a321..68bb53d33c 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -107,6 +107,7 @@ def get_inputs(): else: from sega import SEGA system = SEGA(num_gpts, num_registers, cache_size, graph) + system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map") root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 982235697a..e73a6d1843 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -52,10 +52,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.wl_engine = WLEngine( update_queue_size=64, register_file_size=register_file_size, + examine_window=8, rd_per_cycle=4, reduce_per_cycle=32, wr_per_cycle=4, - num_updates_processed=8, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -69,9 +69,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=4096, + resp_queue_size=1024, + examine_window=12, max_propagates_per_cycle=8, - update_queue_size=32, + update_queue_size=64, ) self.vertex_mem_ctrl = HBMCtrl( @@ -137,6 +138,43 @@ def setPort(self, port): self.xbar.cpu_side_ports = port +class 
SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + class SEGA(System): def __init__( self, @@ -156,30 +194,9 @@ def __init__( self.cache_line_size = 32 self.mem_mode = "timing" - # Building the CenteralController - self.mirror_mem = SimpleMemory( - latency="90ns", - latency_var="0ns", - bandwidth="256GiB/s", - image_file=f"{graph_path}/mirrors", - range=AddrRange(start=0, size="4GiB"), - in_addr_map=False, - ) - self.map_mem = SimpleMemory( - latency="0ns", - latency_var="0ns", - bandwidth="1024GiB/s", - image_file=f"{graph_path}/mirrors_map", - range=AddrRange(start=0, size="4GiB"), - in_addr_map=False, - ) - self.ctrl = CenteralController( - vertex_image_file=f"{graph_path}/vertices", - mem_port=self.mirror_mem.port, - mirrors_map_mem=self.map_mem.port, - mirrors_mem=self.mirror_mem - ) - # Building the EdgeMemories + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + edge_mem = [] for i in range(int(num_gpts / 2)): mem = EdgeMemory("4GiB") @@ -207,46 +224,52 @@ 
gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) def work_count(self): - return self.ctrl.workCount() + return self.ctrl.controller.workCount() def set_async_mode(self): - self.ctrl.setAsyncMode() + self.ctrl.controller.setAsyncMode() def set_bsp_mode(self): - self.ctrl.setBSPMode() + self.ctrl.controller.setBSPMode() def set_pg_mode(self): - self.ctrl.setPGMode() + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) + self.ctrl.controller.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) + self.ctrl.controller.createBFSWorkload(init_addr, init_value) def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) def create_cc_workload(self): - self.ctrl.createCCWorkload() + self.ctrl.controller.createCCWorkload() def create_async_pr_workload(self, alpha, threshold): - self.ctrl.createAsyncPRWorkload(alpha, threshold) + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) def create_pr_workload(self, num_nodes, alpha): - self.ctrl.createPRWorkload(num_nodes, alpha) + self.ctrl.controller.createPRWorkload(num_nodes, alpha) def get_pr_error(self): - return self.ctrl.getPRError() + return self.ctrl.controller.getPRError() def create_bc_workload(self, init_addr, init_value): - 
self.ctrl.createBCWorkload(init_addr, init_value) + self.ctrl.controller.createBCWorkload(init_addr, init_value) def print_answer(self): - self.ctrl.printAnswerToHostSimout() + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 312e721b0c..e1c73765bc 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -52,9 +52,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.wl_engine = WLEngine( update_queue_size=64, register_file_size=register_file_size, - rd_per_cycle=2, + examine_window=8, + rd_per_cycle=4, reduce_per_cycle=32, - wr_per_cycle=2, + wr_per_cycle=4, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -63,17 +64,19 @@ def __init__(self, register_file_size: int, cache_size: str): pending_pull_limit=64, active_buffer_size=80, post_push_wb_queue_size=64, + transitions_per_cycle=4, ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=4096, + resp_queue_size=1024, + examine_window=12, max_propagates_per_cycle=8, - update_queue_size=32, + update_queue_size=64, ) self.vertex_mem_ctrl = SimpleMemory( - latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + latency="120ns", bandwidth="28GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port @@ -130,10 +133,52 @@ def setPort(self, port): self.xbar.cpu_side_ports = port + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + 
self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): + def __init__( + self, + num_gpts, + num_registers, + cache_size, + graph_path, + ): super(SEGA, self).__init__() - # num_gpts should be an even power of 2 assert num_gpts != 0 assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 @@ -144,11 +189,9 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.cache_line_size = 32 self.mem_mode = "timing" - # Building the CenteralController - self.ctrl = CenteralController( - vertex_image_file=f"{graph_path}/vertices" - ) - # Building the EdgeMemories + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + edge_mem = [] for i in range(int(num_gpts / 2)): mem = EdgeMemory("4GiB") @@ -162,7 +205,9 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): gpts = [] for i in range(num_gpts): gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) gpt.setEdgeMemPort( self.edge_mem[i % (int(num_gpts / 2))].getPort() ) @@ -173,40 +218,52 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) def work_count(self): - return self.ctrl.workCount() + return self.ctrl.controller.workCount() def set_async_mode(self): 
- self.ctrl.setAsyncMode() + self.ctrl.controller.setAsyncMode() def set_bsp_mode(self): - self.ctrl.setBSPMode() + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) + self.ctrl.controller.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) + self.ctrl.controller.createBFSWorkload(init_addr, init_value) def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) def create_cc_workload(self): - self.ctrl.createCCWorkload() + self.ctrl.controller.createCCWorkload() def create_async_pr_workload(self, alpha, threshold): - self.ctrl.createAsyncPRWorkload(alpha, threshold) + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) def create_pr_workload(self, num_nodes, alpha): - self.ctrl.createPRWorkload(num_nodes, alpha) + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() def create_bc_workload(self, init_addr, init_value): - self.ctrl.createBCWorkload(init_addr, init_value) + self.ctrl.controller.createBCWorkload(init_addr, init_value) def print_answer(self): - self.ctrl.printAnswerToHostSimout() + self.ctrl.controller.printAnswerToHostSimout() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 
9bcb237a9b..d40998d584 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -37,6 +37,9 @@ class CenteralController(BaseMemoryEngine): mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") + choose_best = Param.Bool("Whether to prefer the best update " + "value for choosing the next slice") + vertex_image_file = Param.String("Path to the vertex image file.") mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 63fa1eae62..2174f943f4 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -42,6 +42,9 @@ class PushEngine(BaseMemoryEngine): "push engine where it stores the " "edges read from memory.") + examine_window = Param.Int("Number of edges at the front of the edge queue" + " to examine in order to propagate.") + max_propagates_per_cycle = Param.Int("Maximum number of propagates " "done per cycle.") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index cfec70081d..f9ea4488df 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -45,8 +45,10 @@ class WLEngine(BaseReduceEngine): "many updates as this queueu has " "entries at the same time.") + examine_window = Param.Int("Number of updates at the front of update " + "queue examined for reading.") rd_per_cycle = Param.Int("Maximum number of reads per cycle.") reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.") wr_per_cycle = Param.Int("Maximum number of writes per cycle.") - - num_updates_processed = Param.Int("Maximum number of updates processed") + + diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 1f3b7f5ac1..7f06ef245e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -45,6 +45,7 @@ 
CenteralController::CenteralController(const Params& params): BaseMemoryEngine(params), mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), + chooseBest(params.choose_best), nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()) { uint64_t total_cache_size = 0; @@ -247,15 +248,26 @@ CenteralController::recvDoneSignal() int CenteralController::chooseNextSlice() { - int ret_slice_id = -1; + int crowded_slice_id = -1; int max_pending_count = 0; + // TODO: Make this general for all workloads + uint32_t best_update = -1; + int best_slice_id = -1; for (int i = 0; i < numTotalSlices; i++) { if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; - ret_slice_id = i; + crowded_slice_id = i; } + if (workload->betterThan(bestPendingUpdate[i], best_update)) { + best_update = bestPendingUpdate[i]; + best_slice_id = i; + } + } + if (chooseBest) { + return best_slice_id; + } else { + return crowded_slice_id; } - return ret_slice_id; } void @@ -312,7 +324,7 @@ CenteralController::processNextSliceSwitchEvent() updates_generated++; } bestPendingUpdate[dst_id] = - workload->reduce(bestPendingUpdate[dst_id], mirrors[i].prop); + workload->betterThan(mirrors[i].prop, bestPendingUpdate[dst_id]); } } PacketPtr write_mirrors = diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 67c4a9593e..7cafa3e54f 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -82,9 +82,9 @@ class CenteralController : public BaseMemoryEngine int verticesPerSlice; int totalUpdatesLeft; + bool chooseBest; int* numPendingUpdates; uint32_t* bestPendingUpdate; - int chooseNextSlice(); EventFunctionWrapper nextSliceSwitchEvent; diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index c85c60fd8d..ba57b387f4 100644 --- a/src/accl/graph/sega/enums.cc +++ 
b/src/accl/graph/sega/enums.cc @@ -58,14 +58,4 @@ const char* readDestinationStrings[NUM_READ_DESTINATION] = "READ_FOR_PUSH" }; -const char* processingModeStrings[NUM_PROCESSING_MODE] = -{ - "NOT_SET", - "ASYNCHRONOUS", - "POLY_GRAPH", - "BULK_SYNCHRONOUS" -}; - -const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"START", "END"}; - } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 194fdc2140..0f654c5386 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -79,22 +79,6 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum PGMode -{ - NA, - SCATTER, - GATHER, - NUM_PG_MODE -}; - -enum PointerType -{ - START, - END, - NUM_POINTER_TYPE -}; -extern const char* pointerTypeStrings[NUM_POINTER_TYPE]; - } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 2584cf02af..7035a53e93 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,6 +43,7 @@ PushEngine::PushEngine(const Params& params): lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + examineWindow(params.examine_window), maxPropagatesPerCycle(params.max_propagates_per_cycle), updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), @@ -341,7 +342,7 @@ PushEngine::processNextPropagateEvent() int num_tries = 0; int num_reads = 0; std::deque> temp_edge; - for (int i = 0; i < maxPropagatesPerCycle; i++) { + for (int i = 0; i < examineWindow; i++) { if (metaEdgeQueue.empty()) { break; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2aced4b156..0108a2d7ef 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -136,6 +136,7 @@ class 
PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; + int examineWindow; int maxPropagatesPerCycle; std::deque> metaEdgeQueue; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index c294441703..0b64e09d67 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -44,10 +44,10 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), + examineWindow(params.examine_window), maxReadsPerCycle(params.rd_per_cycle), maxReducesPerCycle(params.reduce_per_cycle), maxWritesPerCycle(params.wr_per_cycle), - maxUpdatesProcessed(params.num_updates_processed), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), @@ -211,7 +211,7 @@ void WLEngine::processNextReadEvent() { std::deque> temp_queue; - for (int i = 0; i < maxUpdatesProcessed; i++) { + for (int i = 0; i < examineWindow; i++) { if (updateQueue.empty()) { break; } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index bb8e82f501..2c08e4e273 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -80,12 +80,11 @@ class WLEngine : public BaseReduceEngine std::deque> updateQueue; std::unordered_map valueMap; + int examineWindow; int maxReadsPerCycle; int maxReducesPerCycle; int maxWritesPerCycle; - int maxUpdatesProcessed; - int registerFileSize; std::unordered_map> registerFile; std::unordered_map workListFile; From 791d9af55396ee5cf095ee2ffaa4fb5a7c2a3b65 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Apr 2023 08:24:59 -0700 Subject: [PATCH 263/279] Updating config scripts. 
--- configs/accl/bfs.py | 39 ++++++++++++++++++++++++++++++++++--- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 20 +++++++++---------- 3 files changed, 46 insertions(+), 15 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 68bb53d33c..e62719729c 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -39,6 +39,22 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--tile", + dest="tile", + action="store_const", + const=True, + default=False, + help="Whether to use temporal partitioning", + ) + argparser.add_argument( + "--best", + dest="best", + action="store_const", + const=True, + default=False, + help="Whether to use best update value for switching slices", + ) argparser.add_argument( "--visited", dest="visited", @@ -81,6 +97,8 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.tile, + args.best, args.visited, args.simple, args.sample, @@ -96,6 +114,8 @@ def get_inputs(): graph, init_addr, init_value, + tile, + best, visited, simple, sample, @@ -106,14 +126,23 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) - system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map") + if tile: + system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map") + + if best: + system.set_choose_best(True) + root = Root(full_system=False, system=system) m5.instantiate() - # system.set_async_mode() - system.set_pg_mode() + if tile: + system.set_pg_mode() + else: + system.set_async_mode() + system.create_pop_count_directory(64) if visited: system.create_bfs_visited_workload(init_addr, init_value) @@ -131,6 +160,8 @@ def get_inputs(): m5.stats.reset() elif exit_event.getCause() == "Done with all the slices.": break + elif exit_event.getCause() == "no update left to process.": + break else: 
while True: exit_event = m5.simulate() @@ -140,5 +171,7 @@ def get_inputs(): ) if exit_event.getCause() == "Done with all the slices.": break + if exit_event.getCause() == "no update left to process.": + break if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e73a6d1843..2df36fab20 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -162,7 +162,7 @@ def __init__(self, mirror_bw): self.controller.mirrors_map_mem = self.map_mem.port def set_choose_best(self, choose_best): - self.controlller.choose_best = choose_best + self.controller.choose_best = choose_best def set_vertices_image(self, vertices): self.controller.vertex_image_file = vertices diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index e1c73765bc..516a7968a8 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -76,7 +76,7 @@ def __init__(self, register_file_size: int, cache_size: str): ) self.vertex_mem_ctrl = SimpleMemory( - latency="120ns", bandwidth="28GiB/s" + latency="120ns", bandwidth="256GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port @@ -153,21 +153,21 @@ def __init__(self, mirror_bw): in_addr_map=False, ), ) - self.controlller.mem_port = self.controlller.mirrors_mem.port - self.controlller.mirrors_map_mem = self.map_mem.port + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port def set_choose_best(self, choose_best): - self.controlller.choose_best = choose_best + self.controller.choose_best = choose_best def set_vertices_image(self, vertices): - self.controlller.vertex_image_file = vertices + self.controller.vertex_image_file = vertices def set_aux_images(self, mirrors, mirrors_map): - self.controlller.mirrors_mem.image_file = mirrors + self.controller.mirrors_mem.image_file = mirrors self.map_mem.image_file = mirrors_map def set_mpu_vector(self, mpu_vector): - self.controlller.mpu_vector = mpu_vector + 
self.controller.mpu_vector = mpu_vector class SEGA(System): @@ -205,9 +205,7 @@ def __init__( gpts = [] for i in range(num_gpts): gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_gpts]] - ) + gpt.set_vertex_range(vertex_ranges[i]) gpt.setEdgeMemPort( self.edge_mem[i % (int(num_gpts / 2))].getPort() ) @@ -233,7 +231,7 @@ def set_pg_mode(self): self.ctrl.controller.setPGMode() def set_aux_images(self, mirrors, mirrors_map): - self.ctrl.set_images(mirrors, mirrors_map) + self.ctrl.set_aux_images(mirrors, mirrors_map) def set_choose_best(self, choose_best): self.ctrl.set_choose_best(choose_best) From d126c2424285fde8bb8d1df719917d0ba434602c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 13 Apr 2023 21:14:22 -0700 Subject: [PATCH 264/279] Adding stats to centeral controller. --- src/accl/graph/sega/centeral_controller.cc | 44 ++++++++++++++++++++-- src/accl/graph/sega/centeral_controller.hh | 15 ++++++++ src/accl/graph/sega/coalesce_engine.cc | 6 +-- src/accl/graph/sega/coalesce_engine.hh | 2 +- src/accl/graph/sega/push_engine.cc | 5 +-- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 5 +-- src/accl/graph/sega/wl_engine.hh | 2 +- 8 files changed, 64 insertions(+), 17 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7f06ef245e..472e623a66 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -46,7 +46,8 @@ CenteralController::CenteralController(const Params& params): mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), chooseBest(params.choose_best), - nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()) + nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()), + stats(*this) { uint64_t total_cache_size = 0; for (auto mpu : params.mpu_vector) 
{ @@ -253,13 +254,16 @@ CenteralController::chooseNextSlice() // TODO: Make this general for all workloads uint32_t best_update = -1; int best_slice_id = -1; + int max_best_pending_count = 0; for (int i = 0; i < numTotalSlices; i++) { if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; crowded_slice_id = i; } - if (workload->betterThan(bestPendingUpdate[i], best_update)) { + if (numPendingUpdates[i] > max_best_pending_count && + workload->betterThan(bestPendingUpdate[i], best_update)) { best_update = bestPendingUpdate[i]; + max_best_pending_count = numPendingUpdates[i]; best_slice_id = i; } } @@ -278,6 +282,9 @@ CenteralController::processNextSliceSwitchEvent() int bytes_accessed = 0; int updates_generated_total = 0; for (int dst_id = 0; dst_id < numTotalSlices; dst_id++) { + if (dst_id == currentSliceId) { + continue; + } int updates_generated = 0; Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(int); Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(int); @@ -294,7 +301,7 @@ CenteralController::processNextSliceSwitchEvent() int num_bytes = end_addr - start_addr; int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); - MirrorVertex mirrors [num_mirrors]; + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); memPort.sendFunctional(read_mirrors); @@ -331,6 +338,7 @@ CenteralController::processNextSliceSwitchEvent() createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); memPort.sendFunctional(write_mirrors); delete write_mirrors; + delete [] mirrors; DPRINTF(CenteralController, "%s: Done scattering updates from slice " "%d to slice %d.\n", __func__, currentSliceId, dst_id); DPRINTF(CenteralController, "%s: Generated %d updates from slice " @@ -354,6 +362,9 @@ CenteralController::processNextSliceSwitchEvent() "next slice.\n", __func__, currentSliceId); for (int src_id = 0; src_id < 
numTotalSlices; src_id++) { + if (src_id == currentSliceId) { + continue; + } Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(int); Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(int); PacketPtr start = createReadPacket(start_pointer, sizeof(int)); @@ -367,7 +378,7 @@ CenteralController::processNextSliceSwitchEvent() int num_bytes = end_addr - start_addr; int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); - MirrorVertex mirrors [num_mirrors]; + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); memPort.sendFunctional(read_mirrors); @@ -388,6 +399,7 @@ CenteralController::processNextSliceSwitchEvent() createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); memPort.sendFunctional(write_mirrors); delete write_mirrors; + delete [] mirrors; DPRINTF(CenteralController, "%s: Done gathering updates from slice " "%d to slice %d.\n", __func__, src_id, currentSliceId); bytes_accessed += num_bytes; @@ -395,6 +407,9 @@ CenteralController::processNextSliceSwitchEvent() double mirror_mem_bw = mirrorsMem->getBW(); Tick time_to_switch = bytes_accessed * mirror_mem_bw; + stats.switchTicks += time_to_switch; + stats.switchedBytes += bytes_accessed; + stats.numSwitches++; for (auto mpu: mpuVector) { mpu->startProcessingMirrors(time_to_switch); } @@ -458,4 +473,25 @@ CenteralController::printAnswerToHostSimout() } } +CenteralController::ControllerStats::ControllerStats(CenteralController& _ctrl): + statistics::Group(&_ctrl), ctrl(_ctrl), + ADD_STAT(numSwitches, statistics::units::Byte::get(), + "Number of slices switches completed."), + ADD_STAT(switchedBytes, statistics::units::Byte::get(), + "Number of bytes accessed during slice switching."), + ADD_STAT(switchTicks, statistics::units::Tick::get(), + "Number of ticks spent switching slices."), + ADD_STAT(switchSeconds, statistics::units::Second::get(), + "Traversed Edges Per Second.") 
+{ +} + +void +CenteralController::ControllerStats::regStats() +{ + using namespace statistics; + + switchSeconds = switchTicks / simFreq; +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 7cafa3e54f..b5acff672d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -90,6 +90,21 @@ class CenteralController : public BaseMemoryEngine EventFunctionWrapper nextSliceSwitchEvent; void processNextSliceSwitchEvent(); + struct ControllerStats : public statistics::Group + { + ControllerStats(CenteralController& ctrl); + + void regStats() override; + + CenteralController& ctrl; + + statistics::Scalar numSwitches; + statistics::Scalar switchedBytes; + statistics::Scalar switchTicks; + statistics::Formula switchSeconds; + }; + ControllerStats stats; + protected: virtual void recvMemRetry() override; virtual bool handleMemResp(PacketPtr pkt) override; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 9938034a88..5e0c8c8095 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1237,10 +1237,8 @@ CoalesceEngine::processNextDoneSignalEvent() } } -CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) - : statistics::Group(&_coalesce), - coalesce(_coalesce), - lastResetTick(0), +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine& _coalesce): + statistics::Group(&_coalesce), coalesce(_coalesce), lastResetTick(0), ADD_STAT(numVertexReads, statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 9de401cf81..3a9e463595 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -164,7 +164,7 @@ class CoalesceEngine : public 
BaseMemoryEngine struct CoalesceStats : public statistics::Group { - CoalesceStats(CoalesceEngine &coalesce); + CoalesceStats(CoalesceEngine& coalesce); virtual void regStats() override; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 7035a53e93..eea41448da 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -515,9 +515,8 @@ PushEngine::processNextUpdatePushEvent() } } -PushEngine::PushStats::PushStats(PushEngine &_push) - : statistics::Group(&_push), - push(_push), +PushEngine::PushStats::PushStats(PushEngine& _push): + statistics::Group(&_push), push(_push), ADD_STAT(numPropagates, statistics::units::Count::get(), "Number of propagate operations done."), ADD_STAT(updateQueueFull, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 0108a2d7ef..7170d2d22e 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -165,7 +165,7 @@ class PushEngine : public BaseMemoryEngine struct PushStats : public statistics::Group { - PushStats(PushEngine &push); + PushStats(PushEngine& push); void regStats() override; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0b64e09d67..8e5ccc9ebe 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -457,9 +457,8 @@ WLEngine::processNextDoneSignalEvent() } } -WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) - : statistics::Group(&_wl), - wl(_wl), +WLEngine::WorkListStats::WorkListStats(WLEngine& _wl): + statistics::Group(&_wl), wl(_wl), ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), "Number of coalescions in the update queues."), ADD_STAT(registerShortage, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2c08e4e273..ad67f19cb5 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ 
b/src/accl/graph/sega/wl_engine.hh @@ -107,7 +107,7 @@ class WLEngine : public BaseReduceEngine struct WorkListStats : public statistics::Group { - WorkListStats(WLEngine &worklist); + WorkListStats(WLEngine& worklist); void regStats() override; From f710081953f69521a0550bb6822fad23e46aad5a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 13 Apr 2023 21:35:07 -0700 Subject: [PATCH 265/279] Updating choosing next slice. --- src/accl/graph/sega/centeral_controller.cc | 31 ++++++++++++++-------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 472e623a66..b4492ff91b 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -249,29 +249,38 @@ CenteralController::recvDoneSignal() int CenteralController::chooseNextSlice() { - int crowded_slice_id = -1; + int ret_slice_id = -1; int max_pending_count = 0; - // TODO: Make this general for all workloads + // TODO: Make this generalizable for all workloads. 
uint32_t best_update = -1; - int best_slice_id = -1; - int max_best_pending_count = 0; for (int i = 0; i < numTotalSlices; i++) { if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; - crowded_slice_id = i; } - if (numPendingUpdates[i] > max_best_pending_count && - workload->betterThan(bestPendingUpdate[i], best_update)) { + if (workload->betterThan(bestPendingUpdate[i], best_update)) { best_update = bestPendingUpdate[i]; - max_best_pending_count = numPendingUpdates[i]; - best_slice_id = i; } } if (chooseBest) { - return best_slice_id; + int max_count = 0; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_count && + bestPendingUpdate[i] == best_update) { + max_count = numPendingUpdates[i]; + ret_slice_id = i; + } + } } else { - return crowded_slice_id; + uint32_t best_value = -1; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] == max_pending_count && + workload->betterThan(bestPendingUpdate[i], best_value)) { + best_value = bestPendingUpdate[i]; + ret_slice_id = i; + } + } } + return ret_slice_id; } void From 1205dba519601b9d922c1cc67e4f967b8ead8fff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 13 Apr 2023 21:47:34 -0700 Subject: [PATCH 266/279] Fixing choosing next slice. 
--- src/accl/graph/sega/centeral_controller.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index b4492ff91b..dab48ee246 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -257,7 +257,8 @@ CenteralController::chooseNextSlice() if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; } - if (workload->betterThan(bestPendingUpdate[i], best_update)) { + if (numPendingUpdates[i] > 0 && + workload->betterThan(bestPendingUpdate[i], best_update)) { best_update = bestPendingUpdate[i]; } } From c949bab1704bccd77d41f1fb7c288fe29ecb49fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Apr 2023 17:39:56 -0700 Subject: [PATCH 267/279] Fixing sign extend issue when address is bigger than 2GB. --- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/centeral_controller.cc | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2df36fab20..17d84bd86c 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -154,7 +154,7 @@ def __init__(self, mirror_bw): latency="0ns", latency_var="0ns", bandwidth=mirror_bw, - range=AddrRange(start=0, size="4GiB"), + range=AddrRange(start=0, size="16GiB"), in_addr_map=False, ), ) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 516a7968a8..9d4177df94 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -149,7 +149,7 @@ def __init__(self, mirror_bw): latency="0ns", latency_var="0ns", bandwidth=mirror_bw, - range=AddrRange(start=0, size="4GiB"), + range=AddrRange(start=0, size="16GiB"), in_addr_map=False, ), ) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index dab48ee246..ec9199d194 100644 --- 
a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -296,14 +296,14 @@ CenteralController::processNextSliceSwitchEvent() continue; } int updates_generated = 0; - Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(int); - Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(int); + Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t); + Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t); PacketPtr start = createReadPacket(start_pointer, sizeof(int)); PacketPtr end = createReadPacket(end_pointer, sizeof(int)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); - Addr start_addr = start->getLE(); - Addr end_addr = end->getLE(); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); delete start; delete end; DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, @@ -375,14 +375,14 @@ CenteralController::processNextSliceSwitchEvent() if (src_id == currentSliceId) { continue; } - Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(int); - Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(int); + Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t); + Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t); PacketPtr start = createReadPacket(start_pointer, sizeof(int)); PacketPtr end = createReadPacket(end_pointer, sizeof(int)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); - Addr start_addr = start->getLE(); - Addr end_addr = end->getLE(); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); delete start; delete end; From 5a0f7092dccb7fe61b84e2b8499b4607337558bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Apr 2023 19:07:10 -0700 Subject: [PATCH 268/279] Fixing the packet size issue. 
--- src/accl/graph/sega/centeral_controller.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index ec9199d194..a3a0b3854c 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -298,8 +298,8 @@ CenteralController::processNextSliceSwitchEvent() int updates_generated = 0; Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t); Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t); - PacketPtr start = createReadPacket(start_pointer, sizeof(int)); - PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); Addr start_addr = start->getLE(); @@ -377,8 +377,8 @@ CenteralController::processNextSliceSwitchEvent() } Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t); Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t); - PacketPtr start = createReadPacket(start_pointer, sizeof(int)); - PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); Addr start_addr = start->getLE(); From 40497bbc08e8008d3c60ff195954dda3a42c756d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Apr 2023 14:25:42 -0700 Subject: [PATCH 269/279] Fixing overflow issue with counting the number of bytes. 
--- src/accl/graph/sega/centeral_controller.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index a3a0b3854c..781ffa6005 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -309,8 +309,8 @@ CenteralController::processNextSliceSwitchEvent() DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, currentSliceId, dst_id, start_addr, end_addr); - int num_bytes = end_addr - start_addr; - int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); + uint64_t num_bytes = end_addr - start_addr; + uint64_t num_mirrors = (end_addr - start_addr) / sizeof(MirrorVertex); MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); @@ -386,8 +386,8 @@ CenteralController::processNextSliceSwitchEvent() delete start; delete end; - int num_bytes = end_addr - start_addr; - int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); + uint64_t num_bytes = end_addr - start_addr; + uint64_t num_mirrors = (end_addr - start_addr) / sizeof(MirrorVertex); MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); From 0d370e7b5672a1b95b1e989dbf4b93eb46456f29 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Jul 2023 15:09:10 -0700 Subject: [PATCH 270/279] Bunch of stuff --- configs/accl/bc.py | 1 + configs/accl/bfs.py | 18 +- configs/accl/sega.py | 5 + configs/accl/sega_double_simple.py | 270 +++++++++++++++++++++ configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/centeral_controller.cc | 2 + src/accl/graph/sega/wl_engine.hh | 9 + 7 files changed, 305 insertions(+), 2 deletions(-) create mode 100644 configs/accl/sega_double_simple.py diff --git a/configs/accl/bc.py b/configs/accl/bc.py index 56faeb3e4d..202ec9b8e6 100644 --- a/configs/accl/bc.py +++ 
b/configs/accl/bc.py @@ -99,6 +99,7 @@ def get_inputs(): system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) + m5.instantiate() system.set_bsp_mode() diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index e62719729c..431f843b04 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -71,6 +71,14 @@ def get_inputs(): default=False, help="Use simple memory for vertex", ) + argparser.add_argument( + "--dsimple", + dest="dsimple", + action="store_const", + const=True, + default=False, + help="Use simple memory for both vertex and edge", + ) argparser.add_argument( "--sample", dest="sample", @@ -101,6 +109,7 @@ def get_inputs(): args.best, args.visited, args.simple, + args.dsimple, args.sample, args.verify, ) @@ -118,12 +127,19 @@ def get_inputs(): best, visited, simple, + dsimple, sample, verify, ) = get_inputs() if simple: + if dsimple: + raise ValueError("Can only pass either of --simple or --dsimple") from sega_simple import SEGA + elif dsimple: + if simple: + raise ValueError("Can only pass either of --simple or --dsimple") + from sega_double_simple import SEGA else: from sega import SEGA @@ -143,7 +159,7 @@ def get_inputs(): else: system.set_async_mode() - system.create_pop_count_directory(64) + system.create_pop_count_directory(32) if visited: system.create_bfs_visited_workload(init_addr, init_value) else: diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 17d84bd86c..795089579a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -188,6 +188,8 @@ def __init__( assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 + self._num_gpts = num_gpts + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() @@ -273,3 +275,6 @@ def create_bc_workload(self, init_addr, init_value): def print_answer(self): self.ctrl.controller.printAnswerToHostSimout() + + def get_num_gpts(self): + return self._num_gpts \ No newline at 
end of file diff --git a/configs/accl/sega_double_simple.py b/configs/accl/sega_double_simple.py new file mode 100644 index 0000000000..87f37ce269 --- /dev/null +++ b/configs/accl/sega_double_simple.py @@ -0,0 +1,270 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="32GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + 
super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "9.6GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = SimpleMemory( + latency="90ns", + bandwidth="76.8GiB/s", + range=AddrRange(size), + in_addr_map=False, + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="32GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + num_registers, + cache_size, + graph_path, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + 
self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + 
self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 9d4177df94..3d56e9b3ca 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -194,7 +194,7 @@ def __init__( edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("4GiB") + mem = EdgeMemory("16GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 781ffa6005..af8f6348e0 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -385,6 +385,8 @@ CenteralController::processNextSliceSwitchEvent() Addr end_addr = end->getLE(); delete start; delete end; + DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, + src_id, currentSliceId, start_addr, end_addr); uint64_t num_bytes = end_addr - start_addr; uint64_t num_mirrors = (end_addr - start_addr) / sizeof(MirrorVertex); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index ad67f19cb5..78663ba19d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -148,5 +148,14 @@ class WLEngine : public BaseReduceEngine bool done(); }; +// virtual AddrRangeList getAddrRanges() const; + +// protected: +// virtual bool recvTimingReq(PacketPtr pkt); +// virtual Tick recvAtomic(PacketPtr pkt); +// virtual void recvFunctional(PacketPtr pkt); +// virtual void recvRespRetry(); + + } #endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ 
From 9a6b65f4c92826acc256bb5fe72ea0a76fb8840b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Aug 2023 19:26:16 -0700 Subject: [PATCH 271/279] Fixing compilation issues after rebasing. --- src/accl/graph/sega/base_memory_engine.hh | 2 +- src/accl/graph/sega/centeral_controller.hh | 3 +-- src/accl/graph/sega/push_engine.hh | 3 +-- src/accl/graph/sega/wl_engine.hh | 3 +-- src/base/addr_range.hh | 1 + src/mem/hbm_ctrl.cc | 7 +---- src/mem/hbm_ctrl.hh | 5 ++-- src/mem/mem_ctrl.cc | 30 +--------------------- src/mem/mem_ctrl.hh | 8 ------ 9 files changed, 10 insertions(+), 52 deletions(-) diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index afe7fd0433..31e7d85bef 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -72,7 +72,7 @@ class BaseMemoryEngine : public ClockedObject public: MemPort(const std::string& name, BaseMemoryEngine* owner): - RequestPort(name, owner), owner(owner), + RequestPort(name), owner(owner), _blocked(false), blockedPacket(nullptr) {} diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index b5acff672d..d98d2b3727 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -57,8 +57,7 @@ class CenteralController : public BaseMemoryEngine public: ReqPort(const std::string& name, CenteralController* owner, PortID id): - RequestPort(name, owner), - owner(owner), blockedPacket(nullptr), _id(id) + RequestPort(name), owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7170d2d22e..aefbda3ba2 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -57,8 +57,7 @@ class PushEngine : public BaseMemoryEngine public: ReqPort(const std::string& 
name, PushEngine* owner, PortID id) : - RequestPort(name, owner), - owner(owner), blockedPacket(nullptr), _id(id) + RequestPort(name), owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 78663ba19d..b1a85a2465 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -56,8 +56,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner, PortID id): - ResponsePort(name, owner), - owner(owner), needSendRetryReq(false), _id(id) + ResponsePort(name), owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 526db62606..b314bfefe1 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -748,6 +748,7 @@ class AddrRange return AddrRange(0, 0); } return AddrRange(start, end); + } friend AddrRange mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit) diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 9f6d81243d..6e7a0761d5 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,8 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), - pchBit(p.pch_bit), - retryRdReqPC1(false), retryWrReqPC1(false), + retryRdReqPC1(false), retryWrReqPC1(false), pchBit(p.pch_bit), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, name()), @@ -289,11 +288,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) if (readQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port -<<<<<<< HEAD retryRdReq = true; -======= - MemCtrl::retryRdReq = true; ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states stats.numRdRetry++; return false; } 
else { diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index 348152bf31..657851eaa6 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,14 +72,15 @@ class HBMCtrl : public MemCtrl } private: - // Position of the pseudochannel bit in addresses. - int pchBit; + /** * Remember if we have to retry a request for second pseudo channel. */ bool retryRdReqPC1; bool retryWrReqPC1; + int pchBit; + /** * Remove commands that have already issued from rowBurstTicks * and colBurstTicks diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index 1f55a3f33d..9a3600f331 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -211,7 +211,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - // stats.readPktSize[ceilLog2(size)]++; + stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; @@ -767,11 +767,7 @@ MemCtrl::verifyMultiCmd(Tick cmd_tick, Tick max_cmds_per_burst, } bool -<<<<<<< HEAD MemCtrl::inReadBusState(bool next_state, const MemInterface* mem_intr) const -======= -MemCtrl::inReadBusState(bool next_state, MemInterface* mem_intr) const ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states { // check the bus state if (next_state) { @@ -784,11 +780,7 @@ MemCtrl::inReadBusState(bool next_state, MemInterface* mem_intr) const } bool -<<<<<<< HEAD MemCtrl::inWriteBusState(bool next_state, const MemInterface* mem_intr) const -======= -MemCtrl::inWriteBusState(bool next_state, MemInterface* mem_intr) const ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states { // check the bus state if (next_state) { @@ -909,24 +901,14 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, if (switched_cmd_type) { if (mem_intr->busState == MemCtrl::READ) { DPRINTF(MemCtrl, -<<<<<<< HEAD "Switching to writes after %d reads with %d reads " 
"waiting\n", mem_intr->readsThisTime, mem_intr->readQueueSize); -======= - "Switching to writes after %d reads with %d reads " - "waiting\n", mem_intr->readsThisTime, mem_intr->readQueueSize); ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states stats.rdPerTurnAround.sample(mem_intr->readsThisTime); mem_intr->readsThisTime = 0; } else { DPRINTF(MemCtrl, -<<<<<<< HEAD "Switching to reads after %d writes with %d writes " "waiting\n", mem_intr->writesThisTime, mem_intr->writeQueueSize); -======= - "Switching to reads after %d writes with %d writes " - "waiting\n", mem_intr->writesThisTime, mem_intr->writeQueueSize); ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states stats.wrPerTurnAround.sample(mem_intr->writesThisTime); mem_intr->writesThisTime = 0; } @@ -1055,12 +1037,8 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // Also ensure that we've issued a minimum defined number // of reads before switching, or have emptied the readQ if ((mem_intr->writeQueueSize > writeHighThreshold) && -<<<<<<< HEAD (mem_intr->readsThisTime >= minReadsPerSwitch || mem_intr->readQueueSize == 0) -======= - (mem_intr->readsThisTime >= minReadsPerSwitch || mem_intr->readQueueSize == 0) ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states && !(nvmWriteBlock(mem_intr))) { switch_to_writes = true; } @@ -1449,14 +1427,8 @@ MemCtrl::drain() { // if there is anything in any of our internal queues, keep track // of that as well -<<<<<<< HEAD if (totalWriteQueueSize || totalReadQueueSize || !respQEmpty() || !allIntfDrained()) { -======= - if (!(!totalWriteQueueSize && !totalReadQueueSize && respQEmpty() && - allIntfDrained())) { - ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states DPRINTF(Drain, "Memory controller not drained, write: %d, read: %d," " resp: %d\n", totalWriteQueueSize, totalReadQueueSize, respQueue.size()); diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index 
2de4184a5a..917798ffa7 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -762,11 +762,7 @@ class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or next bus state * @return True when bus is currently in a read state */ -<<<<<<< HEAD bool inReadBusState(bool next_state, const MemInterface* mem_intr) const; -======= - bool inReadBusState(bool next_state, MemInterface* mem_intr) const; ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states /** * Check the current direction of the memory channel @@ -774,11 +770,7 @@ class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or next bus state * @return True when bus is currently in a write state */ -<<<<<<< HEAD bool inWriteBusState(bool next_state, const MemInterface* mem_intr) const; -======= - bool inWriteBusState(bool next_state, MemInterface* mem_intr) const; ->>>>>>> mem: HBMCtrl changes to allow PC data buses to be in different states Port &getPort(const std::string &if_name, PortID idx=InvalidPortID) override; From 8f49979788a019752e8849d39eb02240637f3153 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Aug 2023 20:03:47 -0700 Subject: [PATCH 272/279] Cleaning up the config files --- .../sega_detailed.py} | 31 +++++++++++----- .../accl/{ => archived}/sega_double_simple.py | 0 configs/accl/async-pr.py | 20 +--------- configs/accl/bc.py | 20 +--------- configs/accl/bfs.py | 37 +------------------ configs/accl/cc.py | 20 +--------- configs/accl/pr.py | 22 ++--------- configs/accl/sega.py | 34 +++++------------ configs/accl/sssp.py | 22 ++--------- 9 files changed, 46 insertions(+), 160 deletions(-) rename configs/accl/{sega_simple.py => archived/sega_detailed.py} (91%) rename configs/accl/{ => archived}/sega_double_simple.py (100%) diff --git a/configs/accl/sega_simple.py b/configs/accl/archived/sega_detailed.py similarity index 91% rename from configs/accl/sega_simple.py rename to configs/accl/archived/sega_detailed.py 
index 3d56e9b3ca..795089579a 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/archived/sega_detailed.py @@ -43,7 +43,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): intlvMatch=i, ) ) - return ret + return ret, intlv_low_bit + intlv_bits - 1 class GPT(SubSystem): @@ -75,8 +75,9 @@ def __init__(self, register_file_size: int, cache_size: str): update_queue_size=64, ) - self.vertex_mem_ctrl = SimpleMemory( - latency="120ns", bandwidth="256GiB/s" + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64(), ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port @@ -104,8 +105,12 @@ def getEdgeMemPort(self): def setEdgeMemPort(self, port): self.push_engine.mem_port = port - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit class EdgeMemory(SubSystem): @@ -183,6 +188,8 @@ def __init__( assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 + self._num_gpts = num_gpts + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() @@ -194,18 +201,21 @@ def __init__( edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem # Building the GPTs - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), num_gpts, 32 + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 ) gpts = [] for i in range(num_gpts): gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + 
gpt.set_vertex_pch_bit(pch_bit) gpt.setEdgeMemPort( self.edge_mem[i % (int(num_gpts / 2))].getPort() ) @@ -265,3 +275,6 @@ def create_bc_workload(self, init_addr, init_value): def print_answer(self): self.ctrl.controller.printAnswerToHostSimout() + + def get_num_gpts(self): + return self._num_gpts \ No newline at end of file diff --git a/configs/accl/sega_double_simple.py b/configs/accl/archived/sega_double_simple.py similarity index 100% rename from configs/accl/sega_double_simple.py rename to configs/accl/archived/sega_double_simple.py diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py index 0bfb6caeaa..a24f2ff860 100644 --- a/configs/accl/async-pr.py +++ b/configs/accl/async-pr.py @@ -29,24 +29,16 @@ import argparse from m5.objects import * +from sega import SEGA def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) argparser.add_argument( "--sample", dest="sample", @@ -68,12 +60,10 @@ def get_inputs(): return ( args.num_gpts, - args.num_registers, args.cache_size, args.graph, args.alpha, args.threshold, - args.simple, args.sample, args.verify, ) @@ -82,21 +72,15 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, - num_registers, cache_size, graph, alpha, threshold, - simple, sample, verify, ) = get_inputs() - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/bc.py b/configs/accl/bc.py index 
202ec9b8e6..c100068aa2 100644 --- a/configs/accl/bc.py +++ b/configs/accl/bc.py @@ -29,24 +29,16 @@ import argparse from m5.objects import * +from sega import SEGA def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) argparser.add_argument( "--sample", dest="sample", @@ -68,12 +60,10 @@ def get_inputs(): return ( args.num_gpts, - args.num_registers, args.cache_size, args.graph, args.init_addr, args.init_value, - args.simple, args.sample, args.verify, ) @@ -82,21 +72,15 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, - num_registers, cache_size, graph, init_addr, init_value, - simple, sample, verify, ) = get_inputs() - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 431f843b04..6c33c93f59 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -29,12 +29,12 @@ import argparse from m5.objects import * +from sega import SEGA def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) @@ -63,22 +63,6 @@ def get_inputs(): default=False, help="Use visitation version of BFS", ) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - 
const=True, - default=False, - help="Use simple memory for vertex", - ) - argparser.add_argument( - "--dsimple", - dest="dsimple", - action="store_const", - const=True, - default=False, - help="Use simple memory for both vertex and edge", - ) argparser.add_argument( "--sample", dest="sample", @@ -100,7 +84,6 @@ def get_inputs(): return ( args.num_gpts, - args.num_registers, args.cache_size, args.graph, args.init_addr, @@ -108,8 +91,6 @@ def get_inputs(): args.tile, args.best, args.visited, - args.simple, - args.dsimple, args.sample, args.verify, ) @@ -118,7 +99,6 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, - num_registers, cache_size, graph, init_addr, @@ -126,24 +106,11 @@ def get_inputs(): tile, best, visited, - simple, - dsimple, sample, verify, ) = get_inputs() - if simple: - if dsimple: - raise ValueError("Can only pass either of --simple or --dsimple") - from sega_simple import SEGA - elif dsimple: - if simple: - raise ValueError("Can only pass either of --simple or --dsimple") - from sega_double_simple import SEGA - else: - from sega import SEGA - - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, cache_size, graph) if tile: system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map") diff --git a/configs/accl/cc.py b/configs/accl/cc.py index 9b6d2b587d..0fd4fe3505 100644 --- a/configs/accl/cc.py +++ b/configs/accl/cc.py @@ -29,22 +29,14 @@ import argparse from m5.objects import * +from sega import SEGA def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) argparser.add_argument( "--sample", dest="sample", @@ -66,10 +58,8 @@ def get_inputs(): return ( 
args.num_gpts, - args.num_registers, args.cache_size, args.graph, - args.simple, args.sample, args.verify, ) @@ -78,19 +68,13 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, - num_registers, cache_size, graph, - simple, sample, verify, ) = get_inputs() - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 569514eb82..723f122908 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -29,26 +29,18 @@ import argparse from m5.objects import * +from sega import SEGA def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("iterations", type=int) argparser.add_argument("alpha", type=float) argparser.add_argument("--num_nodes", type=int, default=1) argparser.add_argument("--error_threshold", type=float, default=0.0) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) argparser.add_argument( "--sample", dest="sample", @@ -70,14 +62,12 @@ def get_inputs(): return ( args.num_gpts, - args.num_registers, args.cache_size, args.graph, args.iterations, args.alpha, args.num_nodes, args.error_threshold, - args.simple, args.sample, args.verify, ) @@ -86,25 +76,19 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, - num_registers, cache_size, graph, iterations, alpha, num_nodes, error_threshold, - simple, sample, verify, ) = get_inputs() print(f"error_threshold: {error_threshold}") - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, 
cache_size, graph) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() @@ -112,6 +96,7 @@ def get_inputs(): system.set_bsp_mode() system.create_pop_count_directory(64) system.create_pr_workload(num_nodes, alpha) + iteration = 0 if sample: while True: exit_event = m5.simulate(100000000) @@ -124,7 +109,6 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - iteration = 0 while iteration < iterations: exit_event = m5.simulate() print( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 795089579a..bd3ffe567f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -43,7 +43,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): intlvMatch=i, ) ) - return ret, intlv_low_bit + intlv_bits - 1 + return ret class GPT(SubSystem): @@ -75,9 +75,8 @@ def __init__(self, register_file_size: int, cache_size: str): update_queue_size=64, ) - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(), - dram_2=HBM_2000_4H_1x64(), + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="256GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port @@ -105,12 +104,8 @@ def getEdgeMemPort(self): def setEdgeMemPort(self, port): self.push_engine.mem_port = port - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range class EdgeMemory(SubSystem): @@ -179,7 +174,6 @@ class SEGA(System): def __init__( self, num_gpts, - num_registers, cache_size, graph_path, ): @@ -188,8 +182,6 @@ def __init__( assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 - self._num_gpts = num_gpts - self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" 
self.clk_domain.voltage_domain = VoltageDomain() @@ -198,24 +190,21 @@ def __init__( self.ctrl = SEGAController("256GiB/s") self.ctrl.set_vertices_image(f"{graph_path}/vertices") - + num_registers = 128 edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("4GiB") + mem = EdgeMemory("16GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem # Building the GPTs - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 ) gpts = [] for i in range(num_gpts): gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_gpts]] - ) - gpt.set_vertex_pch_bit(pch_bit) + gpt.set_vertex_range(vertex_ranges[i]) gpt.setEdgeMemPort( self.edge_mem[i % (int(num_gpts / 2))].getPort() ) @@ -275,6 +264,3 @@ def create_bc_workload(self, init_addr, init_value): def print_answer(self): self.ctrl.controller.printAnswerToHostSimout() - - def get_num_gpts(self): - return self._num_gpts \ No newline at end of file diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py index f2e60b856a..e23ebfb365 100644 --- a/configs/accl/sssp.py +++ b/configs/accl/sssp.py @@ -29,24 +29,16 @@ import argparse from m5.objects import * +from sega import SEGA def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) argparser.add_argument( "--sample", dest="sample", @@ -68,12 +60,10 @@ def get_inputs(): return ( args.num_gpts, - args.num_registers, 
args.cache_size, args.graph, args.init_addr, args.init_value, - args.simple, args.sample, args.verify, ) @@ -82,21 +72,15 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, - num_registers, cache_size, graph, init_addr, init_value, - simple, sample, verify, ) = get_inputs() - - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() From ee1058c3b1ae0420d96f7b85c9d67a58e441ad0e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 21 Aug 2023 08:01:32 -0700 Subject: [PATCH 273/279] Fixing the memory controller segmentation fault --- src/accl/graph/sega/push_engine.hh | 3 ++- src/mem/mem_ctrl.cc | 8 ++++---- src/mem/mem_ctrl.hh | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index aefbda3ba2..5d139e4c98 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -57,7 +57,8 @@ class PushEngine : public BaseMemoryEngine public: ReqPort(const std::string& name, PushEngine* owner, PortID id) : - RequestPort(name), owner(owner), blockedPacket(nullptr), _id(id) + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index 9a3600f331..b43c1b8366 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -211,7 +211,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; @@ -1213,8 +1213,8 @@ 
MemCtrl::CtrlStats::CtrlStats(MemCtrl &_ctrl) ADD_STAT(numWrRetry, statistics::units::Count::get(), "Number of times write queue was full causing retry"), - ADD_STAT(readPktSize, statistics::units::Count::get(), - "Read request sizes (log2)"), + // ADD_STAT(readPktSize, statistics::units::Count::get(), + // "Read request sizes (log2)"), ADD_STAT(writePktSize, statistics::units::Count::get(), "Write request sizes (log2)"), @@ -1286,7 +1286,7 @@ MemCtrl::CtrlStats::regStats() avgRdQLen.precision(2); avgWrQLen.precision(2); - readPktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); + // readPktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); writePktSize.init(ceilLog2(ctrl.system()->cacheLineSize()) + 1); rdQLenPdf.init(ctrl.readBufferSize); diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index 917798ffa7..d33724e327 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -581,7 +581,7 @@ class MemCtrl : public qos::MemCtrl statistics::Scalar numRdRetry; statistics::Scalar numWrRetry; - statistics::Vector readPktSize; + // statistics::Vector readPktSize; statistics::Vector writePktSize; statistics::Vector rdQLenPdf; statistics::Vector wrQLenPdf; From 276dc58b328a2b5232623296d54c740b6fb89615 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Aug 2023 09:02:11 -0700 Subject: [PATCH 274/279] Adding a stat to count the number of activations. --- src/accl/graph/sega/coalesce_engine.cc | 5 +++++ src/accl/graph/sega/coalesce_engine.hh | 1 + 2 files changed, 6 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 5e0c8c8095..afb0695206 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -680,6 +680,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; + + stats.vertexActivations += active ? 
1 : 0; if (mode == ProcessingMode::ASYNCHRONOUS || mode == ProcessingMode::POLY_GRAPH) { cacheBlocks[block_index].items[wl_offset].activeNow |= active; if (active && (!numActiveBlocksNow.find(block_index))) { @@ -1258,6 +1260,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine& _coalesce): "Number of times memory bandwidth was not available."), ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), "Number of bytes read that were not used by coalesce engine"), + ADD_STAT(vertexActivations, statistics::units::Count::get(), + "Number of times a vertex has become active. " + "Only meaningful in async mode"), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3a9e463595..b7e3821dd7 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -183,6 +183,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar wastefulBytesRead; + statistics::Scalar vertexActivations; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; From 86777275dc56839ddefb4ed3906ab0f39486867d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 16 May 2024 16:29:04 -0700 Subject: [PATCH 275/279] Enable edge mem init through central controller. 
--- src/accl/graph/sega/CenteralController.py | 50 ++++++++++++++-------- src/accl/graph/sega/centeral_controller.cc | 24 +++++++++++ src/accl/graph/sega/centeral_controller.hh | 1 + 3 files changed, 56 insertions(+), 19 deletions(-) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index d40998d584..4d13bee13c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -28,17 +28,21 @@ from m5.params import * from m5.proxy import * from m5.util.pybind import PyBindMethod +from m5.objects.AbstractMemory import ABstractMemory from m5.objects.BaseMemoryEngine import BaseMemoryEngine + class CenteralController(BaseMemoryEngine): - type = 'CenteralController' + type = "CenteralController" cxx_header = "accl/graph/sega/centeral_controller.hh" - cxx_class = 'gem5::CenteralController' + cxx_class = "gem5::CenteralController" mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") - choose_best = Param.Bool("Whether to prefer the best update " - "value for choosing the next slice") + choose_best = Param.Bool( + "Whether to prefer the best update " + "value for choosing the next slice" + ) vertex_image_file = Param.String("Path to the vertex image file.") @@ -46,19 +50,27 @@ class CenteralController(BaseMemoryEngine): mpu_vector = VectorParam.MPU("All mpus in the system.") + edge_image_file = Param.String("Path to the edge image file.") + abstract_mem_vector = VectorParam.AbstractMemory( + "Abstract Memories to be intialized by edge_image_file." + ) + abstract_mem_atom_size = Param.Int( + 64, "burst size of the abstract memories." 
+ ) + cxx_exports = [ - PyBindMethod("setAsyncMode"), - PyBindMethod("setBSPMode"), - PyBindMethod("setPGMode"), - PyBindMethod("createPopCountDirectory"), - PyBindMethod("createBFSWorkload"), - PyBindMethod("createBFSVisitedWorkload"), - PyBindMethod("createSSSPWorkload"), - PyBindMethod("createCCWorkload"), - PyBindMethod("createAsyncPRWorkload"), - PyBindMethod("createPRWorkload"), - PyBindMethod("createBCWorkload"), - PyBindMethod("workCount"), - PyBindMethod("getPRError"), - PyBindMethod("printAnswerToHostSimout") - ] + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("setPGMode"), + PyBindMethod("createPopCountDirectory"), + PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), + PyBindMethod("workCount"), + PyBindMethod("getPRError"), + PyBindMethod("printAnswerToHostSimout"), + ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index af8f6348e0..8c2da94de9 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -31,10 +31,12 @@ #include #include +#include "base/addr_range_map.hh" #include "base/cprintf.hh" #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" +#include "mem/abstract_mem.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -182,6 +184,28 @@ CenteralController::startup() } } workload->iterate(); + + const auto& edge_file = params().edge_image_file; + if (edge_file == "") {} + return; + + AddrRangeMap abs_mem_range_map; + for (auto abs_mem: params().abstract_mem_vector) { + for (auto range: abs_mem->getAddrRanges()) { + abs_mem_range_map.insert(range, abs_mem); + } + } + auto* edge_object = loader::createObjectFile(edge_file, true); + 
fatal_if(!object, "%s: Could not load %s.", name(), edge_file); + + loader::debugSymbolTable.insert(*edge_object->symtab().globals()); + loader::MemoryImage edge_image = edge_object->buildImage(); + + PortProxy edge_proxy( + [this](PacketPtr pkt) { + auto routing_entry = abs_mem_range_map.contains(pkt->getAddr()); + routing_entry->second->functionalAccess(pkt); + }, params().abstract_mem_atom_size); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index d98d2b3727..206db65aff 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -38,6 +38,7 @@ #include "accl/graph/sega/enums.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" +#include "base/addr_range_map.hh" #include "base/intmath.hh" #include "mem/simple_mem.hh" #include "params/CenteralController.hh" From 412915f061265c081932ece3bd66a1f13313faa8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 16 May 2024 16:30:17 -0700 Subject: [PATCH 276/279] Adding actual write call. 
--- src/accl/graph/sega/centeral_controller.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 8c2da94de9..8a641e4f94 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -206,6 +206,7 @@ CenteralController::startup() auto routing_entry = abs_mem_range_map.contains(pkt->getAddr()); routing_entry->second->functionalAccess(pkt); }, params().abstract_mem_atom_size); + panic_if(!edge_image.write(edge_proxy), "%s: Unable to write image."); } void From c3c2d6137f0a758b6a64ff898e3210786b72477a Mon Sep 17 00:00:00 2001 From: Will Shaddix Date: Thu, 20 Jun 2024 20:41:52 +0000 Subject: [PATCH 277/279] ready to push changes --- configs/accl/disagg_bfs.py | 170 ++++++++++++++++++++++ configs/accl/disagg_sega.py | 272 ++++++++++++++++++++++++++++++++++++ 2 files changed, 442 insertions(+) create mode 100644 configs/accl/disagg_bfs.py create mode 100644 configs/accl/disagg_sega.py diff --git a/configs/accl/disagg_bfs.py b/configs/accl/disagg_bfs.py new file mode 100644 index 0000000000..42e1c06acb --- /dev/null +++ b/configs/accl/disagg_bfs.py @@ -0,0 +1,170 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * +from disagg_sega import SEGA + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument("fend", type=int) + argparser.add_argument("mem_ctrl_lat", type=str) + + argparser.add_argument( + "--tile", + dest="tile", + action="store_const", + const=True, + default=False, + help="Whether to use temporal partitioning", + ) + argparser.add_argument( + "--best", + dest="best", + action="store_const", + const=True, + default=False, + help="Whether to use best update value for switching slices", + ) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + 
action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + + + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.tile, + args.best, + args.visited, + args.sample, + args.verify, + args.fend, + args.mem_ctrl_lat, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + tile, + best, + visited, + sample, + verify, + fend, + mem_ctrl_lat, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph,fend, mem_ctrl_lat) + if tile: + system.set_aux_images(f"{graph}/mirrors", f"{graph}/mirrors_map") + + if best: + system.set_choose_best(True) + + root = Root(full_system=False, system=system) + + m5.instantiate() + + if tile: + system.set_pg_mode() + else: + system.set_async_mode() + + system.create_pop_count_directory(32) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(743598075) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "simulate() limit reached": + m5.stats.dump() + m5.stats.reset() + elif exit_event.getCause() == "Done with all the slices.": + break + elif exit_event.getCause() == "no update left to process.": + break + else: + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "Done with all the slices.": + break + if exit_event.getCause() == "no update left to process.": + break + if verify: + system.print_answer() diff --git a/configs/accl/disagg_sega.py b/configs/accl/disagg_sega.py new file mode 100644 index 0000000000..f3b4c44c31 --- /dev/null +++ b/configs/accl/disagg_sega.py @@ -0,0 +1,272 @@ +# Copyright (c) 2022 The Regents of the University of 
California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=register_file_size, + examine_window=8, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + transitions_per_cycle=4, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=1024, + examine_window=12, + max_propagates_per_cycle=8, + update_queue_size=64, + base_addr=4294967296, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="120ns", bandwidth="28GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, 
num_channels: str, fend_latency: int, mem_ctrl_lat: int): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + print("fend_latency is: ", fend_latency) + print("mem_ctrl_lat is: ", mem_ctrl_lat) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=fend_latency, forward_latency=1, response_latency=1 + ) + + addr_ranges = interleave_addresses(AddrRange(start="4GiB", size="64GiB"), num_channels, 64) # in_addr_map was False + self.mem_ctrls = [MemCtrl(dram=DDR4_2400_8x8(range=addr_ranges[i], in_addr_map=True), static_frontend_latency=mem_ctrl_lat, port=self.xbar.mem_side_ports) for i in range(num_channels)] + [print(f"{self.mem_ctrls[i]} range is: {addr_ranges[i]}") for i in range(num_channels)] + def get_abs_mems(self): + return [ctrl.dram for ctrl in self.mem_ctrls] + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start="0GiB", size="16GiB"),# was 16 GiB + in_addr_map=False, + ), + edge_base = 0, + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_edges_image(self, edges): + self.controller.edge_image_file = edges + + def set_abs_mems(self, abs_mems): + self.controller.abstract_mem_vector = abs_mems + + def 
set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + +class SEGA(System): + def __init__( + self, + num_gpts, + cache_size, + graph_path, + fend_latency, + mem_ctrl_lat, + ): + super(SEGA, self).__init__() + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.edge_mem = EdgeMemory(4, fend_latency, mem_ctrl_lat) + + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + self.ctrl.set_edges_image(f"{graph_path}/edgelist_0") + self.ctrl.set_abs_mems(self.edge_mem.get_abs_mems()) + num_registers = 128 + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 # was 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.setEdgeMemPort(self.edge_mem.getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + + def work_count(self): + return self.ctrl.controller.workCount() + + def set_async_mode(self): + self.ctrl.controller.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) + + def create_pop_count_directory(self, atoms_per_block): + 
self.ctrl.controller.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.controller.createCCWorkload() + + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.controller.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.controller.printAnswerToHostSimout() From 62bd78efb95a85e57e9a8249710ebcf1e24a7f80 Mon Sep 17 00:00:00 2001 From: Will Shaddix Date: Thu, 20 Jun 2024 20:44:55 +0000 Subject: [PATCH 278/279] Enabling disagg sega --- src/accl/graph/sega/CenteralController.py | 7 +++- src/accl/graph/sega/PushEngine.py | 2 ++ src/accl/graph/sega/centeral_controller.cc | 33 ++++++++++++----- src/accl/graph/sega/centeral_controller.hh | 2 ++ src/accl/graph/sega/mpu.hh | 4 ++- src/accl/graph/sega/push_engine.cc | 2 ++ src/accl/graph/sega/push_engine.hh | 2 +- src/python/m5/SimObject.py | 41 ++++++++++++++++++++-- src/python/m5/params.py | 3 ++ 9 files changed, 82 insertions(+), 14 deletions(-) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 4d13bee13c..0cdd11d251 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -28,7 +28,7 @@ from m5.params import * from m5.proxy import * from m5.util.pybind import PyBindMethod 
-from m5.objects.AbstractMemory import ABstractMemory +from m5.objects.AbstractMemory import AbstractMemory from m5.objects.BaseMemoryEngine import BaseMemoryEngine @@ -51,12 +51,17 @@ class CenteralController(BaseMemoryEngine): mpu_vector = VectorParam.MPU("All mpus in the system.") edge_image_file = Param.String("Path to the edge image file.") + abstract_mem_vector = VectorParam.AbstractMemory( "Abstract Memories to be intialized by edge_image_file." ) abstract_mem_atom_size = Param.Int( 64, "burst size of the abstract memories." ) + + edge_base = Param.UInt64("Addr of base address range") + + cxx_exports = [ PyBindMethod("setAsyncMode"), diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 2174f943f4..50e240808e 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -52,3 +52,5 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") + + base_addr = Param.UInt64("Addr of base address range") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 8a641e4f94..a2970a9013 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -43,11 +43,14 @@ namespace gem5 { +using memory::AbstractMemory; + CenteralController::CenteralController(const Params& params): BaseMemoryEngine(params), mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), chooseBest(params.choose_best), + edgeBase(params.edge_base), nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()), stats(*this) { @@ -139,6 +142,7 @@ CenteralController::createPopCountDirectory(int atoms_per_block) void CenteralController::startup() { + DPRINTF(CenteralController, "Startup 1!\n"); unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); for (auto mpu: mpuVector) { for (auto 
range: mpu->getAddrRanges()) { @@ -147,10 +151,12 @@ CenteralController::startup() mpu->setProcessingMode(mode); mpu->recvWorkload(workload); } + DPRINTF(CenteralController, "Startup 2!\n"); const auto& vertex_file = params().vertex_image_file; if (vertex_file == "") return; + DPRINTF(CenteralController, "Startup 3!\n"); auto* object = loader::createObjectFile(vertex_file, true); fatal_if(!object, "%s: Could not load %s.", name(), vertex_file); @@ -161,6 +167,7 @@ CenteralController::startup() int num_total_vertices = (maxVertexAddr / sizeof(WorkListItem)); numTotalSlices = std::ceil((double) num_total_vertices / verticesPerSlice); + DPRINTF(CenteralController, "Startup 4!\n"); numPendingUpdates = new int [numTotalSlices]; bestPendingUpdate = new uint32_t [numTotalSlices]; @@ -168,6 +175,7 @@ CenteralController::startup() numPendingUpdates[i] = 0; bestPendingUpdate[i] = -1; } + DPRINTF(CenteralController, "Startup 5!\n"); PortProxy vertex_proxy( [this](PacketPtr pkt) { @@ -184,28 +192,37 @@ CenteralController::startup() } } workload->iterate(); - + DPRINTF(CenteralController, "Startup 6!\n"); + DPRINTF(CenteralController, "params().edge_image_file = %s\n", params().edge_image_file); const auto& edge_file = params().edge_image_file; - if (edge_file == "") {} - return; + DPRINTF(CenteralController, "edge_file = %s\n", edge_file); + + // if (edge_file == "") {} // commented this out + // return; + + DPRINTF(CenteralController, "Startup 7!\n"); AddrRangeMap abs_mem_range_map; - for (auto abs_mem: params().abstract_mem_vector) { - for (auto range: abs_mem->getAddrRanges()) { - abs_mem_range_map.insert(range, abs_mem); - } + for (auto abs_mem: params().abstract_mem_vector) { + abs_mem_range_map.insert(abs_mem->getAddrRange(), abs_mem); } + // DPRINTF(CenteralController, "%s, Edge memory ranges: %s", __func__, abs_mem_range_map); auto* edge_object = loader::createObjectFile(edge_file, true); fatal_if(!object, "%s: Could not load %s.", name(), edge_file); 
loader::debugSymbolTable.insert(*edge_object->symtab().globals()); loader::MemoryImage edge_image = edge_object->buildImage(); + DPRINTF(CenteralController, "Startup 8!\n"); PortProxy edge_proxy( - [this](PacketPtr pkt) { + [abs_mem_range_map, this](PacketPtr pkt) { + pkt->setAddr(pkt->getAddr() + mpuVector[0]->getBaseAddr()); auto routing_entry = abs_mem_range_map.contains(pkt->getAddr()); routing_entry->second->functionalAccess(pkt); }, params().abstract_mem_atom_size); + + DPRINTF(CenteralController, "%s, mpuVector[0]->getBaseAddr(): %lu", __func__, mpuVector[0]->getBaseAddr()); + panic_if(!edge_image.write(edge_proxy), "%s: Unable to write image."); } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 206db65aff..74f8124380 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,6 +74,8 @@ class CenteralController : public BaseMemoryEngine memory::SimpleMemory* mirrorsMem; + // AddrRangeMap abs_mem_range_map; // moved here from .cc file + Addr edgeBase; std::vector mpuVector; AddrRangeMap mpuAddrMap; diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 4afb2081ca..7d6d7d4003 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -57,7 +57,7 @@ class MPU : public SimObject WLEngine* wlEngine; CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - + public: PARAMS(MPU); MPU(const Params& params); @@ -95,6 +95,8 @@ class MPU : public SimObject void recvDoneSignal(); bool done(); + + uint64_t getBaseAddr() {return pushEngine->params().base_addr;}; }; } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index eea41448da..ea6c6e47b5 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -50,6 +50,7 @@ PushEngine::PushEngine(const Params& params): nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), 
nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), + base(params.base_addr), stats(*this) { destinationQueues.clear(); @@ -256,6 +257,7 @@ PushEngine::processNextMemoryReadEvent() EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); Tick entrance_tick = std::get<1>(edgePointerQueue.front()); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + aligned_addr += base; if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { DPRINTF(PushEngine, "%s: Current packet information generated by " diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5d139e4c98..b46ce6e3ed 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -128,7 +128,7 @@ class PushEngine : public BaseMemoryEngine Tick lastIdleEntranceTick; AddrRangeList localAddrRange; - + Addr base; int numPendingPulls; int edgePointerQueueSize; std::deque> edgePointerQueue; diff --git a/src/python/m5/SimObject.py b/src/python/m5/SimObject.py index 08105d8833..cdfd0f367a 100644 --- a/src/python/m5/SimObject.py +++ b/src/python/m5/SimObject.py @@ -1048,6 +1048,7 @@ def find_any(self, ptype): def find_all(self, ptype): all = {} # search children + # print(f"I {self._name} am self at entry.") for child in self._children.values(): # a child could be a list, so ensure we visit each item if isinstance(child, list): @@ -1064,34 +1065,54 @@ def find_all(self, ptype): all[child] = True if isSimObject(child): # also add results from the child itself + # print(f"I {self._name} am child to {self._parent}") + # print(f"Me children are {self._children}") + # print(f"Me looking for {ptype}") child_all, done = child.find_all(ptype) + # print(f"My ({self._name}) children are {child_all}") all.update(dict(zip(child_all, [done] * len(child_all)))) # search param space for pname, pdesc in self._params.items(): if 
issubclass(pdesc.ptype, ptype): match_obj = self._values[pname] if not isproxy(match_obj) and not isNullPointer(match_obj): - all[match_obj] = True + # print(f"I {match_obj} be match_object") # maybe we can either make the DRAM interfaces children? + # print(type(match_obj)) # or we can maybe check if isSimObjectVector, then serialize it + # print(f" here is all: {type(all)} {all}") + if type(match_obj) is SimObjectVector: + # print("sim object vector!!!") + for simobj in match_obj: + print(simobj) + all[simobj] = True + else: + all[match_obj] = True + # print(f"post all is true") # Also make sure to sort the keys based on the objects' path to # ensure that the order is the same on all hosts + # print(f"I {self._name} am self at exit.") return sorted(all.keys(), key=lambda o: o.path()), True def unproxy(self, base): return self def unproxyParams(self): + print(f"Me be {self._name} at the entry of unproxyParams.") for param in self._params.keys(): value = self._values.get(param) + print(f"me value is {value}") + if value != None and isproxy(value): try: + print(f"me type im trying to unproxy is {type(value)}") value = value.unproxy(self) except: + print(f"Me be {param} when hit error") print( f"Error in unproxying param '{param}' of {self.path()}" ) raise setattr(self, param, value) - + print(f"Me be {self._name} at the exit of unproxyParams.") # Unproxy ports in sorted order so that 'append' operations on # vector ports are done in a deterministic fashion. port_names = list(self._ports.keys()) @@ -1190,7 +1211,21 @@ def getCCParams(self): self.path(), param, ) - + if (not isinstance(value, EthernetAddr)) and isproxy(value): + # At the time of adding this error unproxying params happens + # in simulate.py at lines 103-104 (commit hash: f56459470a) + # To understand how attributes are handled for SimObjects + # refer to SimObject::__setattr__. + fatal( + f"Param {param} for {self._name} has value = {value}. " + "This value is a not a valid value. 
This could be caused " + f"by {param} not having been unproxied correctly. " + "One reason why this might happen is if you have " + "mistakenly added a child SimObject as an attr and not a " + "child by giving it a name that starts with an underscore " + f"`_`. {self.path()} should not say 'orphan.'" + ) + value = value.getValue() if isinstance(self._params[param], VectorParamDesc): assert isinstance(value, list) diff --git a/src/python/m5/params.py b/src/python/m5/params.py index 2ca6dfcc14..65f87ff3df 100644 --- a/src/python/m5/params.py +++ b/src/python/m5/params.py @@ -260,11 +260,14 @@ def getValue(self): return [v.getValue() for v in self] def unproxy(self, base): + print(f" me Unproxying vector param, me is {self}") if len(self) == 1 and isinstance(self[0], proxy.BaseProxy): + print(f"me in if params.py, about to try to unproxy {self[0]}") # The value is a proxy (e.g. Parent.any, Parent.all or # Parent.x) therefore try resolve it return self[0].unproxy(base) else: + print("me in else params.py") return [v.unproxy(base) for v in self] From 240ad42aa19dfb9bb7433443f6ed9667378c99a7 Mon Sep 17 00:00:00 2001 From: Will Shaddix Date: Sat, 22 Jun 2024 01:17:41 +0000 Subject: [PATCH 279/279] added stats to track round trip edge memory lat and outstandingEdgeMemReqs --- src/accl/graph/sega/push_engine.cc | 28 ++++++++++++++++++++++++++-- src/accl/graph/sega/push_engine.hh | 8 ++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ea6c6e47b5..6173fd67e8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,7 +42,8 @@ PushEngine::PushEngine(const Params& params): _running(false), lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), - onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + onTheFlyMemReqs(0), outstandingEdgeReqs(0), maxOutstandingEdgeReqs(0), + 
edgeQueueSize(params.resp_queue_size), examineWindow(params.examine_window), maxPropagatesPerCycle(params.max_propagates_per_cycle), updateQueueSize(params.update_queue_size), @@ -267,8 +268,14 @@ PushEngine::processNextMemoryReadEvent() PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; + reqTickMap[pkt->req] = curTick(); //added this for stats memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; + outstandingEdgeReqs++; + maxOutstandingEdgeReqs = std::max(maxOutstandingEdgeReqs, outstandingEdgeReqs); + stats.maxOutstandingEdgeRequests = maxOutstandingEdgeReqs; + stats.outstandingEdgeRequests.sample(outstandingEdgeReqs); + // stats.outstandingEdgeRequests.sample(onTheFlyMemReqs); curr_info.iterate(); if (curr_info.done()) { @@ -308,9 +315,12 @@ PushEngine::handleMemResp(PacketPtr pkt) { // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); + // here check tick and update the stats + // DPRINTF(PushEngine, "%s: Received a memory response.\n", __func__); uint8_t pkt_data [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; + Tick entrance_tick = reqTickMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); for (int i = 0; i < push_info.numElements; i++) { @@ -327,6 +337,10 @@ PushEngine::handleMemResp(PacketPtr pkt) (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; onTheFlyMemReqs -= push_info.numElements; + outstandingEdgeReqs--; + stats.avgEdgeAccessLatency.sample( + (curTick() - entrance_tick)); + reqTickMap.erase(pkt->req); // added this for edgeAcccessLatency reqInfoMap.erase(pkt->req); delete pkt; @@ -547,7 +561,13 @@ PushEngine::PushStats::PushStats(PushEngine& _push): ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), ADD_STAT(numPropagatesHist, statistics::units::Count::get(), 
- "Histogram of number of propagates sent.") + "Histogram of number of propagates sent."), + ADD_STAT(avgEdgeAccessLatency, statistics::units::Second::get(), + "Histogram of edgeAccessLatency."), + ADD_STAT(outstandingEdgeRequests, statistics::units::Count::get(), + "Histogram of the size of the outstanding edge requests."), + ADD_STAT(maxOutstandingEdgeRequests, statistics::units::Count::get(), + "Histogram of the size of the outstanding edge requests.") { } @@ -564,6 +584,10 @@ PushEngine::PushStats::regStats() edgeQueueLength.init(64); updateQueueLength.init(64); numPropagatesHist.init(1 + push.params().max_propagates_per_cycle); + + // need to check what these init values mean + avgEdgeAccessLatency.init(64); + outstandingEdgeRequests.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index b46ce6e3ed..41fb6391cd 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,8 +133,12 @@ class PushEngine : public BaseMemoryEngine int edgePointerQueueSize; std::deque> edgePointerQueue; std::unordered_map reqInfoMap; + std::unordered_map reqTickMap; + int onTheFlyMemReqs; + int outstandingEdgeReqs; + int maxOutstandingEdgeReqs; int edgeQueueSize; int examineWindow; int maxPropagatesPerCycle; @@ -187,6 +191,10 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgeQueueLength; statistics::Histogram updateQueueLength; statistics::Histogram numPropagatesHist; + + statistics::Histogram avgEdgeAccessLatency; + statistics::Histogram outstandingEdgeRequests; + statistics::Scalar maxOutstandingEdgeRequests = 0; }; PushStats stats;