From 0f3fc73c4df1158760d8959d7d66d3eb1265fd9e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Jan 2022 15:52:52 -0800 Subject: [PATCH 001/287] accl: Adding src code for PushEngine. --- src/accl/push_engine.hh | 69 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 69 insertions(+) create mode 100644 src/accl/push_engine.hh diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh new file mode 100644 index 0000000000..eda9d7b707 --- /dev/null +++ b/src/accl/push_engine.hh @@ -0,0 +1,69 @@ +#ifndef __ACCL_PUSH_ENGINE_HH__ +#define __ACCL_PUSH_ENGINE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/PushEngine.hh" +#include "sim/clocked_object.hh" + +class PushEngine : public ClockedObject +{ + private: + + class PushRespPort : public ResponsePort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class PushReqPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class PushMemPort : public RequestPort + { + private: + bool _blocked; + PacketPtr blockedPacket; + + public: + PushMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + } + + PushRespPort respPort; + PushReqPort reqPort; + PushMemPort memPort; + + std::queue vertexQueue; + std::queue updateQueue; + + std::pair interpretPackPtr(PacketPtr pkt); + +}; + +#endif // __ACCL_PUSH_ENGINE_HH__ From 0dd0beb81d3910a313bb97c0c0dd1489e9f567ae Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 
Feb 2022 17:56:49 -0800 Subject: [PATCH 002/287] Adding implementation for PushEngine (wip). --- src/accl/push_engine.cc | 120 ++++++++++++++++++++++++++++++++++++++++ src/accl/push_engine.hh | 63 ++++++++++++++++++++- src/accl/util.cc | 16 ++++++ src/accl/util.hh | 4 ++ 4 files changed, 200 insertions(+), 3 deletions(-) create mode 100644 src/accl/push_engine.cc create mode 100644 src/accl/util.cc create mode 100644 src/accl/util.hh diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc new file mode 100644 index 0000000000..bc3138f61e --- /dev/null +++ b/src/accl/push_engine.cc @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/push_engine.hh" + +#include "debug/PushEngine.hh" + +PushEngine::PushEngine(const PushEngineParams& params): + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), + vertexQueueSize(params.vertex_queue_size), + vertexQueueLen(0), + updateQueue(params.update_queue_size), + updateQueueLen(0), + nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextCreateEvent([this]{ processNextCreateEvent(); }, name()), + nextSendEvent([this]{ processNextSendEvent(); }, name()) +{} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleUpdate(pkt); +} + +bool +PushEngine::handleUpdate(PacketPtr pkt) +{ + if (vertexQueueLen < vertexQueueSize) { + vertexQueue.push(pkt) + vertexQueueLen++; + return true; + + if (!nextReceiveEvent.scheduled()){ + schedule(nextReceiveEvent, nextCycle()); + } + } + return false; +} + +void +PushEngine::processNextReceiveEvent() +{ + PacketPtr 
updatePkt = vertexQueue.pop(); + uint8_t* data = updatePkt->getData(); + + Addr edgeListAddr = ; // TODO: Generalize finding this address. + int outDegree = ; // TODO: Generalize finding this value. + + Addr reqAddr = (edgeListAddr / 64) * 64; + Addr offsetAddr = edgeListAddr % 64; + + PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + + memPort.sendPacket(pkt); + + +} + +void +PushEngine::processNextReadEvent() +{ + +} + +void +PushEngine::processNextCreateEvent() +{ + +} + +void +PushEngine::processNextSendEvent() +{ + +} \ No newline at end of file diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index eda9d7b707..6ab902d0e2 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -1,8 +1,35 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #ifndef __ACCL_PUSH_ENGINE_HH__ #define __ACCL_PUSH_ENGINE_HH__ #include -#include #include "base/addr_range_map.hh" #include "base/statistics.hh" @@ -10,6 +37,7 @@ #include "mem/packet.hh" #include "params/PushEngine.hh" #include "sim/clocked_object.hh" +#include "sim/system.hh" class PushEngine : public ClockedObject { @@ -18,6 +46,7 @@ class PushEngine : public ClockedObject class PushRespPort : public ResponsePort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -55,14 +84,42 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } - PushRespPort respPort; + System* const system; + const RequestorID requestorId; + PushReqPort reqPort; + PushRespPort respPort; + PushMemPort memPort; std::queue vertexQueue; + int vertexQueueSize; + int vertexQueueLen; + std::queue updateQueue; + int updateQueueSize; + int updateQueueLen; + + EventFunctionWrapper nextReceiveEvent; + void processNextReceiveEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextCreateEvent; + void processNextCreateEvent(); + + EventFunctionWrapper nextSendEvent; + void processNextSendEvent(); + + bool handleUpdate(PacketPtr pkt); + + public: + + PushEngine(const PushEngineParams ¶ms); - std::pair interpretPackPtr(PacketPtr pkt); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; diff --git a/src/accl/util.cc 
b/src/accl/util.cc new file mode 100644 index 0000000000..20abd1c13a --- /dev/null +++ b/src/accl/util.cc @@ -0,0 +1,16 @@ +#include "accl/util.hh" + +PacketPtr +getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) +{ + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} diff --git a/src/accl/util.hh b/src/accl/util.hh new file mode 100644 index 0000000000..c621b9e45c --- /dev/null +++ b/src/accl/util.hh @@ -0,0 +1,4 @@ +#include "mem/packet.hh" + +PacketPtr getReadPacket(Addr addr, unsigned int size); + From 3b359ade313c989b465a5879d738096526cbf6c4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 13:36:08 -0800 Subject: [PATCH 003/287] Adding util source code. --- src/accl/util.cc | 28 ++++++++++++++++++++++++++++ src/accl/util.hh | 48 +++++++++++++++++++++++++++++++++++++++++++++++- 2 files changed, 75 insertions(+), 1 deletion(-) diff --git a/src/accl/util.cc b/src/accl/util.cc index 20abd1c13a..8d975c482f 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -1,3 +1,31 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + #include "accl/util.hh" PacketPtr diff --git a/src/accl/util.hh b/src/accl/util.hh index c621b9e45c..18b8e4c197 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -1,4 +1,50 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "base/types.hh" #include "mem/packet.hh" -PacketPtr getReadPacket(Addr addr, unsigned int size); +struct WorkListItem +{ + uint32_t temp_prop; + uint32_t prop; + uint32_t degree; + Addr edgeList; +} + +struct Edge +{ + uint32_t weight; + Addr neighbor; +} + +WorkListItem& memoryToWorkList(uint8_t* data); +Edge& memoryToEdge(uint8_t* data); +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); From f74e9df55bafd83ea180ad6b9db91840f0e3b9e5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 31 Jan 2022 11:34:07 -0800 Subject: [PATCH 004/287] Adding the first version of Apply engine --- src/accl/apply.cc | 129 ++++++++++++++++++++++++++++++++++++++++++++ src/accl/apply.hh | 132 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 261 insertions(+) create mode 100644 src/accl/apply.cc create mode 100644 src/accl/apply.hh diff --git a/src/accl/apply.cc b/src/accl/apply.cc new file mode 100644 index 0000000000..d0e2b712a6 --- /dev/null +++ b/src/accl/apply.cc @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/apply.h" + +#include + + +typedef std::pair ReqPair; +typedef std::pair QueuePair; + +Apply::Apply(const ApplyParams ¶ms): + ClockedObject(params), + nextApplyEvent([this]{processNextApplyEvent; }, name()), + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), + queueSize(params.applyQueueSize) //add this to .py +{ + applyReadQueue(queueSize); + pplyWriteQueue(queueSize); +} + +bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleWL(pkt)){ + return false; + } + return true; +} + +bool Apply::handleWL(PacketPtr pkt){ + auto queue = applyReadQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); + } + return true; +} + + +void Apply::processNextApplyCheckEvent(){ + auto queue = applyReadQueue; + memPort = ApplyMemPort + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + // handel responsehere + if (!ret) + break; + } + +} + +virtual bool +Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); 
+} + +bool +Apply::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + //check pkt (temp_prop != prop) + if (temp_prop != prop){ + //update prop with temp_prop + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } + return true; + } + return true; +} + + + +void +Apply::processNextApplyEvent(){ + auto queue = applyWriteQueue; + memPort = ApplyMemPort; + pushPort = ApplyReqPort; + while(!queue.empty()){ + auto pkt = queue.pop() + /// conver to ReadReq + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel responsehere + if (!ret || !push) + break; + + } + +} \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh new file mode 100644 index 0000000000..2ae593a1cb --- /dev/null +++ b/src/accl/apply.hh @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class Apply : public ClockedObject +{ + private: + + class ApplyRespPort : public ResponsePort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ApplyRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + } + + class ApplyReqPort : public RequestPort + { + private: + APPLY *owner; + bool _blocked; + PacketPtr blockedPacket; + + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual bool recvTimingResp(PacketPtr pkt); + } + + class ApplyMemPort : public RequestPort + { + private: + Apply *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + ApplyReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + bool 
sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + + } + bool handleWL(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readApplyBuffer(); + bool handleMemResp(PacktPtr resp); + void writePushBuffer(); + + + //Events + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + ApplyQueue applyQueue; + ApplyMemPort memPort; + public(const ApplyParams &apply); +}; + +#endif // __ACCL_APPLY_HH__ \ No newline at end of file From 7945cf333644c9ad0f0e5dfb99e8040d3944785d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 5 Feb 2022 20:34:12 -0800 Subject: [PATCH 005/287] Portotyping memory interface --- src/accl/apply.cc | 36 ++++++++++++++++++++++-------------- src/accl/apply.hh | 8 +++++--- 2 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d0e2b712a6..b0ef5e8513 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -46,7 +46,7 @@ Apply::Apply(const ApplyParams ¶ms): bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!owner->handleWL(pkt)){ + if (!this->handleWL(pkt)){ return false; } return true; @@ -73,7 +73,9 @@ void Apply::processNextApplyCheckEvent(){ while(!queue.empty()){ auto pkt = queue.pop() /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + bool ret = memPort->sendPacket(memPkt); // handel responsehere if (!ret) break; @@ -84,27 +86,24 @@ void Apply::processNextApplyCheckEvent(){ virtual bool Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) { - 
return owner->handleMemResp(pkt); + return this->handleMemResp(pkt); } bool Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - //check pkt (temp_prop != prop) - if (temp_prop != prop){ - //update prop with temp_prop + if (queue->blocked()){ sendPktRetry = true; return false; } else - queue->push(pkt); + queue->push(writePkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } return true; - } return true; } @@ -117,12 +116,21 @@ Apply::processNextApplyEvent(){ pushPort = ApplyReqPort; while(!queue.empty()){ auto pkt = queue.pop() - /// conver to ReadReq - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel responsehere - if (!ret || !push) - break; + uint64_t* data = pkt->getPtr(); + uint32_t* prop = data; + uint32_t* temp_prop = prop + 1; + if (*temp_prop != *prop){ + //update prop with temp_prop + *prop = min(*prop , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + bool ret = memPort->sendPacket(pkt); + bool push = pushPort->sendPacket(pkt); + // handel response here + if (!ret || !push) + break; + } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 2ae593a1cb..e9c27a1fcf 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -61,7 +61,7 @@ class Apply : public ClockedObject class ApplyReqPort : public RequestPort { private: - APPLY *owner; + Apply *owner; bool _blocked; PacketPtr blockedPacket; @@ -124,9 +124,11 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - ApplyQueue applyQueue; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; ApplyMemPort memPort; - public(const ApplyParams &apply); + std::pair + public(const ApplyParams &apply); //fix this }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From 14426cddc9527e56cf96cb15d7382199e4309e98 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 12:04:02 
-0800 Subject: [PATCH 006/287] [wip] Improving the implementation. Adding address range, python params. --- src/accl/Apply.py | 39 ++++++++++++ src/accl/apply.cc | 153 +++++++++++++++++++++++++++++++++++----------- src/accl/apply.hh | 42 ++++++++++--- 3 files changed, 191 insertions(+), 43 deletions(-) create mode 100644 src/accl/Apply.py diff --git a/src/accl/Apply.py b/src/accl/Apply.py new file mode 100644 index 0000000000..01c627d4c8 --- /dev/null +++ b/src/accl/Apply.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class Apply(ClockedObject): + type = 'Apply' + cxx_header = "accl/apply.hh" + cxx_class = 'gem5::Apply' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b0ef5e8513..d605537033 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -26,22 +26,41 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.h" +#include "accl/apply.hh" #include - -typedef std::pair ReqPair; -typedef std::pair QueuePair; - Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), queueSize(params.applyQueueSize) //add this to .py { applyReadQueue(queueSize); - pplyWriteQueue(queueSize); + applyWriteQueue(queueSize); +} + +Port & +Apply::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +Apply::ApplyRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) @@ -52,6 +71,65 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + + +virtual bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryReq(); +} + +void +Apply::ApplyMemPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +Apply::ApplyRequestPort::recvReqRetry() +{ + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +AddrRangeList +Apply::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool 
Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue->blocked()){ @@ -59,34 +137,29 @@ bool Apply::handleWL(PacketPtr pkt){ return false; } else queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } - void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - memPort = ApplyMemPort while(!queue.empty()){ - auto pkt = queue.pop() - /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - bool ret = memPort->sendPacket(memPkt); - // handel responsehere - if (!ret) - break; + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; + } + // conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + memPort->sendPacket(memPkt); + } + else + return; } - -} - -virtual bool -Apply::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); } bool @@ -107,31 +180,39 @@ Apply::handleMemResp(PacktPtr pkt) return true; } - - void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - memPort = ApplyMemPort; - pushPort = ApplyReqPort; while(!queue.empty()){ - auto pkt = queue.pop() + auto pkt = queue.front(); uint64_t* data = pkt->getPtr(); uint32_t* prop = data; uint32_t* temp_prop = prop + 1; if (*temp_prop != *prop){ //update prop with temp_prop *prop = min(*prop , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); - bool ret = memPort->sendPacket(pkt); - bool push = pushPort->sendPacket(pkt); - // handel response here - if (!ret || !push) + if (!memPort->blocked() && !reqPort->blocked()){ //re-think this + 
memPort->sendPacket(pkt); + applyReqPort->sendPacket(pkt); + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else break; } - + else{ + queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } } - } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e9c27a1fcf..fab4cf871a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -56,6 +56,7 @@ class Apply : public ClockedObject virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + void trySendRetry(); } class ApplyReqPort : public RequestPort @@ -64,7 +65,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ std::queue applyQueue; const uint_32 queueSize; @@ -83,12 +83,19 @@ class Apply : public ClockedObject ApplyQueue(uint32_t qSize): queueSize(qSize){} }; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); + void sendPacket(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + protected: + void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); - } + }; class ApplyMemPort : public RequestPort { @@ -96,13 +103,21 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; + public: ApplyReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + + protected: virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; - } bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -110,7 +125,6 @@ class Apply : public ClockedObject bool handleMemResp(PacktPtr resp); void writePushBuffer(); - //Events void 
processNextApplyCheckEvent(); /* Syncronously checked @@ -124,11 +138,25 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ + void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; + + void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; + + AddrRangeList getAddrRanges() const; + ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; + ApplyMemPort memPort; - std::pair - public(const ApplyParams &apply); //fix this + ApplyRespPort respPort; + ApplyRequestPort reqPort; + + public: + Apply(const ApplyParams &apply); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_APPLY_HH__ \ No newline at end of file From 8e79d19e2028a80dda8aa7b2026a010310fec300 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:14:27 -0800 Subject: [PATCH 007/287] [wip] minor fixes to Apply engine --- src/accl/apply.cc | 8 ++++---- src/accl/apply.hh | 44 +++++++++++++++++++++++--------------------- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index d605537033..6ad630f0ac 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -96,7 +96,7 @@ WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) void Apply::ApplyMemPort::trySendRetry() { - sendRetryReq(); + sendRetryResp(); } void @@ -108,7 +108,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) +WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -117,7 +117,7 @@ WLEngine::ApplyRequestPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyRequestPort::recvReqRetry() +Apply::ApplyReqtPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -158,7 +158,7 @@ void Apply::processNextApplyCheckEvent(){ memPort->sendPacket(memPkt); } else - return; + break; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index fab4cf871a..dae3d8ec0e 100644 --- 
a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -43,11 +43,29 @@ class Apply : public ClockedObject { private: + struct ApplyQueue{ + std::queue applyQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return applyQueue.size() == queueSize; + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize){} + }; + class ApplyRespPort : public ResponsePort { private: Apply *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -55,9 +73,11 @@ class Apply : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); void trySendRetry(); - } + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + }; class ApplyReqPort : public RequestPort { @@ -65,24 +85,6 @@ class Apply : public ClockedObject Apply *owner; bool _blocked; PacketPtr blockedPacket; - struct ApplyQueue{ - std::queue applyQueue; - const uint_32 queueSize; - bool sendPktRetry; - - bool blocked(){ - return applyQueue.size() == queueSize; - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize){} - }; public: ApplyReqPort(const std::string& name, SimObject* _owner, From 469a8f7f7897289d5295500f18e7a60e691123d0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 7 Feb 2022 12:26:01 -0800 Subject: [PATCH 008/287] Worklist engine implementation --- src/accl/wl_engine.cc | 185 ++++++++++++++++++++++++++++++++++++++++++ src/accl/wl_engine.hh | 143 ++++++++++++++++++++++++++++++++ 2 files changed, 328 insertions(+) create mode 100644 src/accl/wl_engine.cc create mode 100644 src/accl/wl_engine.hh diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc new file mode 100644 index 0000000000..28f8a4fe11 --- /dev/null +++ b/src/accl/wl_engine.cc @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2020 The Regents of the 
University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/wl_engine.hh" + +#include + + +WLEngine::WLEngine(const WLEngineParams ¶ms): + ClockedObject(params), + nextWLReadEvent([this]{processNextWLReadEvent; }, name()), + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), + queueSize(params.wlQueueSize) //add this to .py +{ + wlReadQueue(queueSize); + wlWriteQueue(queueSize); +} + +bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) +{ + if (!this->handleWLUpdate(pkt)){ + return false; + } + return true; +} + +bool WLEngine::handleWLUpdate(PacketPtr pkt){ + auto queue = wlReadQueue; + if (queue->blocked()){ + queue->sendPktRetry = true; + return false; + } else + queue->push(pkt); + + if(!nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } + return true; +} + + +void WLEngine::processNextWLReadEvent(){ + auto queue = wlReadQueue; + memPort = WLMemPort + while(!queue.empty()){ //create a map instead of front + auto pkt = queue.front() + /// conver to ReadReq + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + if (!memPort->blocked()){ + memPort->sendPacket(memPkt); + break; + } + } + +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; + + owner->wakeUp(); //TODO +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +bool +WLEngine::handleMemResp(PacktPtr pkt) +{ + auto queue = applyWriteQueue; + if (queue->blocked()){ + sendPktRetry = true; + return false; + } else + queue->push(writePkt); + + if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } + return true; + return true; +} + +void +WLEngine::processNextWLReduceEvent(){ + auto queue = wlWriteQueue; + auto updateQ = wlReadQueue; + memPort = WLMemPort; + applyPort = WLReqPort; + while(!queue.empty()){ + auto update = updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + WLRespPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + auto pkt = queue.front() + uint64_t* updatePtr = pkt->getPtr(); + uint64_t* data = pkt->getPtr(); + uint32_t* value = updatePtr; + uint32_t* temp_prop = prop + 1; + if (*value != *prop){ + //update prop with temp_prop + *temp_prop = min(*value , *temp_prop); + RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); + writePkt->setData(data); + if (!memPort->blocked() && !applyPort->blocked()){ + memPort->sendPacket(pkt); + applyPort->sendPacket(pkt); + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + } + else + break; + } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + + } + + } + +} + +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh new file mode 100644 index 0000000000..7269965ff2 --- 
/dev/null +++ b/src/accl/wl_engine.hh @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_WLE_HH__ +#define __ACCL_WLE_HH__ + +#include +#include + +#include "base/addr_range_map.hh" +#include "base/statistics.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class WLEngine : public ClockedObject +{ + private: + + struct WLQueue{ + std::queue wlQueue; + const uint_32 queueSize; + bool sendPktRetry; + + bool blocked(){ + return wlQueue.size() == queueSize; + } + bool empty(){ + return wlQueue.empty(); + } + void push(PacketPtr pkt){ + wlQueue.push(pkt); + } + + WLReqPort(uint32_t qSize): + queueSize(qSize){} + }; + + class WLRespPort : public ResponsePort //From Push engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + WLRespPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + + virtual AddrRangeList getAddrRanges(); + virtual bool recvTimingReq(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLReqPort : public RequestPort //To Apply Engine + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLReqPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void trySendRetry(); + virtual bool recvTimingResp(PacketPtr pkt); + bool blocked(){ + return _blocked; + } + } + + class WLMemPort : public RequestPort + { + private: + WLEngine *owner; + bool _blocked; + PacketPtr blockedPacket; + public: + WLMemPort(const std::string& name, SimObject* _owner, + PortID id=InvalidPortID); + void sendPacket(PacktPtr pkt); + virtual bool recvTimingResp(PacketPtr pkt); + void trySendRetry(); + bool blocked(){ + return _blocked; + } + } + bool handleWLU(PacketPtr pkt); + bool sendPacket(); + //one queue for write and one for read a priotizes write over read + void readWLBuffer(); + bool handleMemResp(PacktPtr resp); + + + //Events + void processNextWLReadEvent(); + /* Syncronously checked + If there are any active vertecies: + create 
memory read packets + MPU::MPU::MemPortsendTimingReq + */ + void processNextWLReduceEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + WLQueue wlReadQueue; + WLQueue wlWriteQueue; + WLMemPort memPort; + std::pair + public: + WLEngine(const WLEngineParams ¶ms); //fix this +}; + +#endif // __ACCL_WLE_HH__ \ No newline at end of file From af73e980a6f14878b8ad77fc6c4d7a649f3d2bcd Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 11 Feb 2022 13:06:32 -0800 Subject: [PATCH 009/287] [wip] Adding the python file to the WLE --- src/accl/WLEngine.py | 39 ++++++++++++ src/accl/wl_engine.cc | 138 ++++++++++++++++++++++++++++-------------- src/accl/wl_engine.hh | 46 ++++++++++---- 3 files changed, 165 insertions(+), 58 deletions(-) create mode 100644 src/accl/WLEngine.py diff --git a/src/accl/WLEngine.py b/src/accl/WLEngine.py new file mode 100644 index 0000000000..fe6b25b6ba --- /dev/null +++ b/src/accl/WLEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class WLEngine(ClockedObject): + type = 'WLEngine' + cxx_header = "accl/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + respPort = ResponsePort("Receives updates") + reqPort = RequestPort("Sends requests to Apply") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 28f8a4fe11..fbf201720d 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,9 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + reqPort(name() + ".reqPort", this), + respPort(name() + ".respPort", this), + memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), queueSize(params.wlQueueSize) //add this to .py @@ -41,6 +44,26 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): wlWriteQueue(queueSize); } +Port & +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "respPort") { + return respPort; + } else if (if_name == 
"memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +AddrRangeList +WLEngine::WLRespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { if (!this->handleWLUpdate(pkt)){ @@ -49,6 +72,68 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } +void +WLEngine::WLRespPort::trySendRetry() +{ + sendRetryReq(); +} + +void +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. + assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +virtual bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +{ + return this->handleMemResp(pkt); +} + +void +WLEngine::WLMemPort::trySendRetry() +{ + sendRetryResp(); +} + +void +WLEngine::WLReqPort::recvReqRetry() +{ + // We should have a blocked packet if this function is called. 
+ assert(_blocked && blockedPacket != nullptr); + _blocked = false; + sendPacket(blockedPacket); + blockedPacket = nullptr; +} + +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +AddrRangeList +WLEngine::getAddrRanges() const +{ + return memPort.getAddrRanges(); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = wlReadQueue; if (queue->blocked()){ @@ -63,14 +148,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } - void WLEngine::processNextWLReadEvent(){ auto queue = wlReadQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); if (!memPort->blocked()){ memPort->sendPacket(memPkt); @@ -80,37 +165,10 @@ void WLEngine::processNextWLReadEvent(){ } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -WLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; - - owner->wakeUp(); //TODO -} - -virtual bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return this->handleMemResp(pkt); -} - bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = applyWriteQueue; + auto queue = wlWriteQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -128,12 +186,11 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = wlWriteQueue; auto updateQ = wlReadQueue; - memPort = WLMemPort; - applyPort = WLReqPort; + applyPort = reqPort; while(!queue.empty()){ auto update = updateQ.pop() if (!updateQ->blocked() & updateQ->sendPktRetry){ - WLRespPort->trySendRetry(); + respPort->trySendRetry(); updateQ->sendPktRetry = false; } auto pkt = queue.front() @@ -144,7 +201,8 @@ WLEngine::processNextWLReduceEvent(){ if (*value != *prop){ //update prop with temp_prop *temp_prop = min(*value , *temp_prop); - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); + RequestPtr req = + std::make_shared(pkt->getAddr(), 64, 0 ,0); PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); writePkt->setData(data); if (!memPort->blocked() && !applyPort->blocked()){ @@ -171,15 +229,3 @@ WLEngine::processNextWLReduceEvent(){ } } - -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} \ No newline at end of file diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7269965ff2..3f39ec7ee8 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -66,7 +66,6 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - bool _blocked; PacketPtr blockedPacket; public: @@ -74,11 +73,11 @@ class WLEngine : public ClockedObject PortID id=InvalidPortID); virtual AddrRangeList getAddrRanges(); + void trySendRetry(); + + protected: virtual bool recvTimingReq(PacketPtr pkt); - bool blocked(){ - return 
_blocked; - } - } + }; class WLReqPort : public RequestPort //To Apply Engine { @@ -86,15 +85,19 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void trySendRetry(); - virtual bool recvTimingResp(PacketPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked(){ return _blocked; } - } + + protected: + void recvReqRetry() override; + virtual bool recvTimingResp(PacketPtr pkt); + }; class WLMemPort : public RequestPort { @@ -102,16 +105,21 @@ class WLEngine : public ClockedObject WLEngine *owner; bool _blocked; PacketPtr blockedPacket; + public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); void sendPacket(PacktPtr pkt); - virtual bool recvTimingResp(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; } - } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; + }; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read @@ -131,13 +139,27 @@ class WLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; + + void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; + + AddrRangeList getAddrRanges() const; WLQueue wlReadQueue; WLQueue wlWriteQueue; WLMemPort memPort; - std::pair + + WLMemPort memPort; + WLRespPort respPort; + WLRequestPort reqPort; + public: - WLEngine(const WLEngineParams ¶ms); //fix this + + WLEngine(const WLEngineParams ¶ms); + Port &getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; #endif // __ACCL_WLE_HH__ \ No newline at end of file From 23e3f42ae186681dedf173e0b42a20bd6b918ab2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 13:06:45 -0800 Subject: [PATCH 010/287] Changing some small errors --- 
src/accl/wl_engine.cc | 19 ++++++++++++------- 1 file changed, 12 insertions(+), 7 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index fbf201720d..e49ad44bf1 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -162,7 +162,6 @@ void WLEngine::processNextWLReadEvent(){ break; } } - } bool @@ -188,12 +187,8 @@ WLEngine::processNextWLReduceEvent(){ auto updateQ = wlReadQueue; applyPort = reqPort; while(!queue.empty()){ - auto update = updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - auto pkt = queue.front() + auto update = updateQ.front(); + auto pkt = queue.front(); uint64_t* updatePtr = pkt->getPtr(); uint64_t* data = pkt->getPtr(); uint32_t* value = updatePtr; @@ -213,6 +208,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop(); + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } else break; @@ -223,6 +223,11 @@ WLEngine::processNextWLReduceEvent(){ memPort->trySendRetry(); queue->sendPktRetry = false; } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } } From 495fc758be9b02fa2e4d8187c57d486c70aa78e3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Feb 2022 17:39:58 -0800 Subject: [PATCH 011/287] [wip] using util in the creating memory packets --- src/accl/apply.cc | 69 ++++++++++++++++++------------ src/accl/apply.hh | 6 +++ src/accl/util.cc | 43 +++++++++++++++++++ src/accl/util.hh | 3 +- src/accl/wl_engine.cc | 97 ++++++++++++++++++++++++------------------- src/accl/wl_engine.hh | 10 ++++- 6 files changed, 155 insertions(+), 73 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6ad630f0ac..6b474d5628 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -32,6 +32,8 @@ Apply::Apply(const ApplyParams 
¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -145,20 +147,25 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - while(!queue.empty()){ - if(!memPort->blocked()){ - auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; - } - // conver to ReadReq - RequestPtr req = std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - memPort->sendPacket(memPkt); + if(!memPort->blocked()){ + auto pkt = queue.pop(); + if(queue->sendPktRetry && !queue->blocked()){ + respPort->trySendRetry(); + queue->sendPktRetry = false; } - else - break; + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; + memPort->sendPacket(memPkt); + } + else{ + break; + } + if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + schedule(nextApplyCheckEvent, nextCycle()); } } @@ -183,21 +190,27 @@ Apply::handleMemResp(PacktPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - while(!queue.empty()){ auto pkt = queue.front(); - uint64_t* data = pkt->getPtr(); - uint32_t* prop = data; - uint32_t* temp_prop = prop + 1; - if (*temp_prop != *prop){ - //update prop with temp_prop - *prop = min(*prop , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - writePkt->setData(data); - if (!memPort->blocked() && !reqPort->blocked()){ //re-think this - memPort->sendPacket(pkt); - applyReqPort->sendPacket(pkt); + uint8_t* data = pkt->getPtr(); + + RequestPtr req = 
pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; + + if (temp_prop != prop){ + if (!memPort->blocked() && !reqPort->blocked()){ + //update prop with temp_prop + wl.prop = min(prop , temp_prop); + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyReqPort->sendPacket(writePkt); queue.pop(); if(queue->sendPktRetry && !queue->blocked()){ memPort->trySendRetry(); @@ -214,5 +227,7 @@ Apply::processNextApplyEvent(){ queue->sendPktRetry = false; } } + if(!queue.empty() && !nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); } } \ No newline at end of file diff --git a/src/accl/apply.hh b/src/accl/apply.hh index dae3d8ec0e..b213d37667 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -146,6 +147,9 @@ class Apply : public ClockedObject void processNextApplyCheckEvent(); EventFunctionWrapper nextApplyCheckEvent; + System* const system; + const RequestorID requestorId; + AddrRangeList getAddrRanges() const; ApplyQueue applyReadQueue; @@ -155,6 +159,8 @@ class Apply : public ClockedObject ApplyRespPort respPort; ApplyRequestPort reqPort; + std::unordered_map requestOffset; + public: Apply(const ApplyParams &apply); Port &getPort(const std::string &if_name, diff --git a/src/accl/util.cc b/src/accl/util.cc index 8d975c482f..8debd3a937 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -42,3 +42,46 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + +PacketPtr getWritePacket(Addr addr, + unsigned 
int size, + uint8_t* data, + RequestorID requestorId) +{ + equestPtr req = std::make_shared(addr, size, 0, + requestorId); + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +unit8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem)/sizeof(uint_8) + uint_8* data = new uint8_t [data_size]; + uint_32* wList = (uint_32*)data; + *wList = wl.prop; + *wList + 1 = wl.temp_prop; + *wList + 2 = wl.degree; + *wList + 3 = wl.edgeIndex; + + return data; +} \ No newline at end of file diff --git a/src/accl/util.hh b/src/accl/util.hh index 18b8e4c197..00ccb7ddd9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -34,7 +34,7 @@ struct WorkListItem uint32_t temp_prop; uint32_t prop; uint32_t degree; - Addr edgeList; + uint32_t edgeIndex; } struct Edge @@ -44,6 +44,7 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); +unit8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index e49ad44bf1..7d6d707ae6 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -33,6 +33,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), @@ -40,8 +42,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), 
queueSize(params.wlQueueSize) //add this to .py { - wlReadQueue(queueSize); - wlWriteQueue(queueSize); + updateQueue(queueSize); + responseQueue(queueSize); } Port & @@ -135,7 +137,7 @@ WLEngine::getAddrRanges() const } bool WLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = wlReadQueue; + auto queue = updateQueue; if (queue->blocked()){ queue->sendPktRetry = true; return false; @@ -149,25 +151,32 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ } void WLEngine::processNextWLReadEvent(){ - auto queue = wlReadQueue; + auto queue = updateQueue; memPort = WLMemPort while(!queue.empty()){ //create a map instead of front auto pkt = queue.front() /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); + std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + requestOffset[req] = req_offset; if (!memPort->blocked()){ + queue.pop() memPort->sendPacket(memPkt); break; } } + if(!queue.empty() && !nextWLReadEvent.scheduled()){ + schedule(nextWLReadEvent, nextCycle()); + } } bool WLEngine::handleMemResp(PacktPtr pkt) { - auto queue = wlWriteQueue; + auto queue = responseQueue; if (queue->blocked()){ sendPktRetry = true; return false; @@ -183,54 +192,56 @@ WLEngine::handleMemResp(PacktPtr pkt) void WLEngine::processNextWLReduceEvent(){ - auto queue = wlWriteQueue; - auto updateQ = wlReadQueue; + auto queue = responseQueue; + auto updateQ = updateQueue; applyPort = reqPort; - while(!queue.empty()){ - auto update = updateQ.front(); - auto pkt = queue.front(); - uint64_t* updatePtr = pkt->getPtr(); - uint64_t* data = pkt->getPtr(); - uint32_t* value = updatePtr; - uint32_t* temp_prop = prop + 1; - if (*value != *prop){ - //update prop with temp_prop - *temp_prop = min(*value , *temp_prop); - RequestPtr req = - std::make_shared(pkt->getAddr(), 64, 0 ,0); - PacketPtr writePkt = new Packet(req, MemCmd::WriteReq); - 
writePkt->setData(data); - if (!memPort->blocked() && !applyPort->blocked()){ - memPort->sendPacket(pkt); - applyPort->sendPacket(pkt); - queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; - } - } - else - break; - } - else{ + auto update = updateQ.front(); + auto value = update->getPtr(); + auto pkt = queue.front(); + uint8_t* data = pkt->getPtr(); + RequestPtr req = pkt->req; + int request_offset = requestOffset[req]; + WorkListItem wl = memoryToWorkList(data + request_offset) + uint32_t temp_prop = wl.temp_prop; + if (temp_prop != *value){ + //update prop with temp_prop + temp_prop = min(value , temp_prop); + if (!memPort->blocked() && !applyPort->blocked()){ + wl.temp_prop = temp_prop; + unit8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + memPort->sendPacket(writePkt); + applyPort->sendPacket(writePkt); queue.pop(); if (!queue->blocked() && queue->sendPktRetry){ memPort->trySendRetry(); queue->sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ->blocked() & updateQ->sendPktRetry){ respPort->trySendRetry(); updateQ->sendPktRetry = false; } - } - + else + break; } + else{ + queue.pop(); + if (!queue->blocked() && queue->sendPktRetry){ + memPort->trySendRetry(); + queue->sendPktRetry = false; + } + updateQ.pop() + if (!updateQ->blocked() & updateQ->sendPktRetry){ + respPort->trySendRetry(); + updateQ->sendPktRetry = false; + } + } + if(!queue && !nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 3f39ec7ee8..7132283463 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,6 +32,7 @@ #include #include 
+#include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" #include "mem/port.hh" @@ -39,6 +40,7 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" + class WLEngine : public ClockedObject { private: @@ -145,10 +147,14 @@ class WLEngine : public ClockedObject void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + System* const system; + const RequestorID requestorId; + std::unordered_map requestOffset; + AddrRangeList getAddrRanges() const; - WLQueue wlReadQueue; - WLQueue wlWriteQueue; + WLQueue updateQueue; + WLQueue responseQueue; WLMemPort memPort; WLMemPort memPort; From 394ffeb71c32901ae564babeadbcd5b6883fb5e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Feb 2022 21:15:35 -0800 Subject: [PATCH 012/287] Completing PushEngine. --- src/accl/push_engine.cc | 174 ++++++++++++++++++++++++++++++---------- src/accl/push_engine.hh | 24 ++++-- src/accl/util.cc | 43 +++++++++- src/accl/util.hh | 6 +- src/mem/packet.hh | 2 + 5 files changed, 196 insertions(+), 53 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bc3138f61e..cd5f73eea3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,26 +26,25 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "accl/util.hh" #include "accl/push_engine.hh" - #include "debug/PushEngine.hh" -PushEngine::PushEngine(const PushEngineParams& params): - ClockedObject(params), +PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - vertexQueueSize(params.vertex_queue_size), - vertexQueueLen(0), - updateQueue(params.update_queue_size), - updateQueueLen(0), - nextReceiveEvent([this]{ processNextReceiveEvent(); }, name()), - nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextCreateEvent([this]{ processNextCreateEvent(); }, name()), - nextSendEvent([this]{ processNextSendEvent(); }, name()) -{} + // vertexQueueSize(params.vertex_queue_size), + // vertexQueueLen(0), + // updateQueue(params.update_queue_size), + // updateQueueLen(0), + nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextSendEvent([this] { processNextSendEvent(); }, name()) +{ +} Port & PushEngine::getPort(const std::string &if_name, PortID idx) @@ -61,60 +60,151 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -bool -PushEngine::handleUpdate(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() { - if (vertexQueueLen < vertexQueueSize) { - vertexQueue.push(pkt) - vertexQueueLen++; - return true; + owner->memPort->getAddrRanges(); +} - if (!nextReceiveEvent.scheduled()){ - schedule(nextReceiveEvent, nextCycle()); - } +bool PushEngine::handleUpdate(PacketPtr pkt) +{ + // if (vertexQueueLen < vertexQueueSize) { + // vertexQueue.push(pkt) + // vertexQueueLen++; + // if (!nextReceiveEvent.scheduled()) { + // 
schedule(nextReceiveEvent, nextCycle()); + // } + // return true; + // } + // return false; + vertexQueue.push(pkt) + if (!nextReceiveEvent.scheduled()) { + schedule(nextReceiveEvent, nextCycle()); } - return false; + return true; } -void -PushEngine::processNextReceiveEvent() +void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.pop(); - uint8_t* data = updatePkt->getData(); - - Addr edgeListAddr = ; // TODO: Generalize finding this address. - int outDegree = ; // TODO: Generalize finding this value. - - Addr reqAddr = (edgeListAddr / 64) * 64; - Addr offsetAddr = edgeListAddr % 64; + uint8_t *data = updatePkt->getData(); + + // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) + uint32_t edge_index = *((uint32_t *)data); + uint32_t degree = *((uint32_t *)(data + 4)); + uint32_t value = *((uint32_t *)(data + 8)); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < degree; index++) { + Addr edge_addr = (edge_index + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } + else { + addr_queue.push(req_addr); + offset_queue.push(req_offset); + num_edge_queue.push(1); + } + } - PacketPtr pkt = getReadPacket(reqAddr, 64, requestorId); + for (int index = 0; index < addr_queue.size(); inedx++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + memReqQueue.push(pkt); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = value; + } - memPort.sendPacket(pkt); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } +} +void PushEngine::processNextReadEvent() +{ + PacketPtr pkt = 
memReqQueue.front(); + if (!memPort.blocked()) { + memPort.sendPacket(pkt); + memReqQueue.pop(); + } + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + schedule(nextReadEvent, nextCycle()); + } } -void -PushEngine::processNextReadEvent() +bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) { + return owner->handleMemResp(pkt); +} +void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", + this->name()); + _blocked = true; + } } -void -PushEngine::processNextCreateEvent() +void PushEngine::handleMemResp(PacketPtr pkt) { + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + Edge e = memoryToEdge(curr_edge_data); + uint32_t *update_data = new uint32_t; + + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + updateQueue.push(update); + } + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } } -void -PushEngine::processNextSendEvent() + +void PushEngine::processNextSendEvent() { + PacketPtr pkt = updateQueue.front(); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + updateQueue.pop(); + } -} \ No newline at end of file + if (!nextSendEvent.scheduled() && !updateQueue.empty()) { + schedule(nextSendEvent, nextCycle()); + } +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 6ab902d0e2..a746dcc265 100644 --- 
a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -51,6 +51,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + //TODO: Implement this; PushRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -65,6 +66,7 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushReqPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); @@ -78,9 +80,12 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: + // TODO: Implement this; PushMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - bool sendPacket(PacktPtr pkt); + + void sendPacket(PacktPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } @@ -93,12 +98,18 @@ class PushEngine : public ClockedObject PushMemPort memPort; std::queue vertexQueue; - int vertexQueueSize; - int vertexQueueLen; + // int vertexQueueSize; + // int vertexQueueLen; + + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + + std::queue memReqQueue; // Infinite queueing? 
std::queue updateQueue; - int updateQueueSize; - int updateQueueLen; + // int updateQueueSize; + // int updateQueueLen; EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -106,9 +117,6 @@ class PushEngine : public ClockedObject EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextCreateEvent; - void processNextCreateEvent(); - EventFunctionWrapper nextSendEvent; void processNextSendEvent(); diff --git a/src/accl/util.cc b/src/accl/util.cc index 8debd3a937..76ed6269c2 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,34 @@ #include "accl/util.hh" + +// Edge: (weight: 64 bits, neighbor: 64 bits) +Edge& +memoryToEdge(uint8_t *data) +{ + uint64_t weight = *((uint64_t*) data); + Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes + Edge e = {weight, neighbor}; + return e; +} + +// Edge: (weight: 64 bits, neighbor: 64 bits) +uint8_t* +edgeToMemory(Edge e) +{ + int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); + + uint8_t* data = new uint8_t [data_size]; + + uint64_t* weightPtr = (uint64_t*) data; + *weightPtr = e.weight; + + Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes + *neighborPtr = e.neighbor; + + return data; +} + PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { @@ -43,6 +71,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } + PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, @@ -53,6 +82,18 @@ PacketPtr getWritePacket(Addr addr, req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, + requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr)requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + 
pkt->allocate(); pkt->setData(data); @@ -84,4 +125,4 @@ workListToMemory(WorkListItem wl){ *wList + 3 = wl.edgeIndex; return data; -} \ No newline at end of file +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 00ccb7ddd9..c309d4967a 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -39,13 +39,15 @@ struct WorkListItem struct Edge { - uint32_t weight; + uint64_t weight; Addr neighbor; } WorkListItem& memoryToWorkList(uint8_t* data); unit8_t* workListToMemory(WorkListItem wl); + Edge& memoryToEdge(uint8_t* data); +uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 9238dbec00..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -148,6 +148,8 @@ class MemCmd HTMAbort, // Tlb shootdown TlbiExtSync, + // MPU Accelerator + UpdateWL, NUM_MEM_CMDS }; From a13dcdb4c82d5a6d75eede265f42364ddb13f01a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 10:20:19 -0800 Subject: [PATCH 013/287] arch: Accelerator [wip] Adding Sconscript, debugging Change-Id: I0cef6e8745ca8f58a17a01d71dfb090fe1a7e606 --- src/accl/PushEngine.py | 39 ++++++++++++++++++++++ src/accl/SConscript | 36 ++++++++++++++++++++ src/accl/apply.cc | 74 +++++++++++++++++++---------------------- src/accl/apply.hh | 24 +++++++++---- src/accl/push_engine.cc | 2 +- src/accl/util.cc | 2 ++ src/accl/util.hh | 7 ++-- src/accl/wl_engine.cc | 71 +++++++++++++++++++-------------------- src/accl/wl_engine.hh | 20 +++++++---- 9 files changed, 180 insertions(+), 95 deletions(-) create mode 100644 src/accl/PushEngine.py create mode 100644 src/accl/SConscript diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py new file mode 100644 index 0000000000..37639377c1 
--- /dev/null +++ b/src/accl/PushEngine.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject + +class PushEngine(ClockedObject): + type = 'PushEngine' + cxx_header = "accl/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + respPort = ResponsePort("Receives requests from WorkList") + reqPort = RequestPort("Sends requests to Push") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/SConscript b/src/accl/SConscript new file mode 100644 index 0000000000..da0774ca44 --- /dev/null +++ b/src/accl/SConscript @@ -0,0 +1,36 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('WLEngine.py') +# SimObject('Apply.py') +# SimObject('PushEngine.py') + +# Source('apply.cc') +Source('wl_engine.cc') +# Source('push_engine.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 6b474d5628..985e6217d7 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -38,11 +38,10 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()), - queueSize(params.applyQueueSize) //add this to .py + nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, name()) { - applyReadQueue(queueSize); - applyWriteQueue(queueSize); + applyReadQueue(params.applyQueueSize); + applyWriteQueue(params.applyQueueSize); } Port & @@ -110,7 +109,7 @@ Apply::ApplyMemPort::recvReqRetry() } void -WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -119,7 +118,7 @@ WLEngine::ApplyReqPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyReqtPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -134,12 +133,13 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; - } else - 
queue->push(pkt); - if(!nextApplyCheckEvent.scheduled()){ + } else{ + queue.push(pkt); + } + if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; @@ -147,22 +147,19 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if(!memPort->blocked()){ + if (!memPort.blocked()){ auto pkt = queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - respPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + respPort.trySendRetry(); + queue.sendPktRetry = false; } // conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - memPort->sendPacket(memPkt); - } - else{ - break; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + memPort.sendPacket(memPkt); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -174,11 +171,11 @@ Apply::handleMemResp(PacktPtr pkt) { auto queue = applyWriteQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -193,41 +190,38 @@ Apply::processNextApplyEvent(){ auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t prop = wl.prop; uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort->blocked() && !reqPort->blocked()){ + if (!memPort.blocked() && 
!reqPort.blocked()){ //update prop with temp_prop - wl.prop = min(prop , temp_prop); + wl.prop = std::min(prop , temp_prop); //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyReqPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyReqPort.sendPacket(writePkt); queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } - else - break; - } - else{ + }else{ queue.pop(); - if(queue->sendPktRetry && !queue->blocked()){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (queue.sendPktRetry && !queue.blocked()){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } } if(!queue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } -} \ No newline at end of file +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index b213d37667..f4dabd6a97 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -35,10 +35,12 @@ #include "accl/util.hh" #include "base/addr_range_map.hh" #include "base/statistics.hh" -#include "mem/port.hh" +#include "base/types.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "mem/port.hh" +#include "params/Apply.hh" #include "sim/clocked_object.hh" +#include "sim/port.hh" class Apply : public ClockedObject { @@ -46,17 +48,25 @@ class Apply : public ClockedObject struct ApplyQueue{ std::queue applyQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return applyQueue.size() == queueSize; + return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue.empty(); + return applyQueue->empty(); } void push(PacketPtr pkt){ - 
applyQueue.push(pkt); + applyQueue->push(pkt); + } + + void pop(){ + applyQueue->pop(); + } + + void front(){ + applyQueue->front(); } ApplyQueue(uint32_t qSize): @@ -167,4 +177,4 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_APPLY_HH__ \ No newline at end of file +#endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index cd5f73eea3..c02009d25a 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,7 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -#include "debug/PushEngine.hh" +// #include "debug/PushEngine.hh" PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), diff --git a/src/accl/util.cc b/src/accl/util.cc index 76ed6269c2..92f6a3e351 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,6 +28,8 @@ #include "accl/util.hh" +#include "base/types.hh" +#include "mem/packet.hh" // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& diff --git a/src/accl/util.hh b/src/accl/util.hh index c309d4967a..737d52e2a1 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -35,7 +36,7 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; -} +}; struct Edge { @@ -44,10 +45,10 @@ struct Edge } WorkListItem& memoryToWorkList(uint8_t* data); -unit8_t* workListToMemory(WorkListItem wl); +uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); \ No newline at end of file +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7d6d707ae6..757bdd2598 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -39,11 +39,10 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()), - queueSize(params.wlQueueSize) //add this to .py + nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(queueSize); - responseQueue(queueSize); + updateQueue(params.wlQueueSize); + responseQueue(params.wlQueueSize); } Port & @@ -138,11 +137,11 @@ WLEngine::getAddrRanges() const bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; - if (queue->blocked()){ - queue->sendPktRetry = true; + if (queue.blocked()){ + queue.sendPktRetry = true; return false; } else - queue->push(pkt); + queue.push(pkt); if(!nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -152,19 +151,19 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - memPort = WLMemPort - while(!queue.empty()){ //create a map instead of front + auto memPort = WLMemPort; + while (!queue.empty()){ 
//create a map instead of front auto pkt = queue.front() /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; - RequestPtr req = + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); - requestOffset[req] = req_offset; - if (!memPort->blocked()){ + requestOffset[request] = req_offset; + if (!memPort.blocked()){ queue.pop() - memPort->sendPacket(memPkt); + memPort.sendPacket(memPkt); break; } } @@ -177,11 +176,11 @@ bool WLEngine::handleMemResp(PacktPtr pkt) { auto queue = responseQueue; - if (queue->blocked()){ + if (queue.blocked()){ sendPktRetry = true; return false; } else - queue->push(writePkt); + queue.push(writePkt); if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); @@ -199,49 +198,47 @@ WLEngine::processNextWLReduceEvent(){ auto value = update->getPtr(); auto pkt = queue.front(); uint8_t* data = pkt->getPtr(); - RequestPtr req = pkt->req; - int request_offset = requestOffset[req]; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset) uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = min(value , temp_prop); - if (!memPort->blocked() && !applyPort->blocked()){ + temp_prop = std::min(value , temp_prop); + if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; - unit8_t* wlItem = workListToMemory(wl); + uint8_t* wlItem = workListToMemory(wl); memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort->sendPacket(writePkt); - applyPort->sendPacket(writePkt); + memPort.sendPacket(writePkt); + applyPort.sendPacket(writePkt); queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && 
queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop(); - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - else - break; } else{ queue.pop(); - if (!queue->blocked() && queue->sendPktRetry){ - memPort->trySendRetry(); - queue->sendPktRetry = false; + if (!queue.blocked() && queue.sendPktRetry){ + memPort.trySendRetry(); + queue.sendPktRetry = false; } updateQ.pop() - if (!updateQ->blocked() & updateQ->sendPktRetry){ - respPort->trySendRetry(); - updateQ->sendPktRetry = false; + if (!updateQ.blocked() & updateQ.sendPktRetry){ + respPort.trySendRetry(); + updateQ.sendPktRetry = false; } } - if(!queue && !nextWLReduceEvent.scheduled()){ + if (!queue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 7132283463..0393cd4cb5 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -37,9 +37,9 @@ #include "base/statistics.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/MPU.hh" +#include "params/WLEngine.hh" #include "sim/clocked_object.hh" - +#include "sim/port.hh" class WLEngine : public ClockedObject { @@ -47,20 +47,26 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint_32 queueSize; + const uint32_t queueSize; bool sendPktRetry; bool blocked(){ - return wlQueue.size() == queueSize; + return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue.empty(); + return wlQueue->empty(); } void push(PacketPtr pkt){ - wlQueue.push(pkt); + wlQueue->push(pkt); + } + void pop(){ + wlQueue->pop(); + } + void front(){ + wlQueue.front()); } - WLReqPort(uint32_t qSize): + WLQueue(uint32_t qSize): queueSize(qSize){} }; From d65b96c0ab6fdd6763a6d940b8bcc8759153930e Mon Sep 17 00:00:00 2001 From: 
Mahyar Samani Date: Mon, 14 Feb 2022 10:02:36 -0800 Subject: [PATCH 014/287] Addin simobject file and startup for PushEngine. --- src/accl/PushEngine.py | 11 ++++++----- src/accl/push_engine.cc | 37 ++++++++++++++++++++++++++++++++++++- src/accl/push_engine.hh | 3 +++ 3 files changed, 45 insertions(+), 6 deletions(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 37639377c1..3215fdaee2 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -26,14 +26,15 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'PushEngine' + type = 'WLEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") + system = Param.System(Parent.any, "The system object this push engine is a part of") + respPort = ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index c02009d25a..f1f8f7698b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -60,6 +60,40 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr 
pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, requestorId); + memPort.sendFunctional(pkt); + } + +} + bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); @@ -104,7 +138,8 @@ void PushEngine::processNextReceiveEvent() std::vector num_edge_queue; for (uint32_t index = 0; index < degree; index++) { - Addr edge_addr = (edge_index + index) * sizeof(Edge); + // FIXME: For now the base edge address is 1048576 + Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index a746dcc265..077c61aa2b 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,6 +39,7 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" +//FIXME: Add gem5 namespace here class PushEngine : public ClockedObject { private: @@ -89,6 +90,8 @@ class PushEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); } + virtual void startup() override; + System* const system; const RequestorID requestorId; From fb64f7d3e1c82b7a71b70a14215f8663c8908d65 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 12:26:30 -0800 Subject: [PATCH 015/287] Bug fixes. 
--- src/accl/SConscript | 8 ++--- src/accl/util.cc | 82 +++++++++++++++++++++++++-------------------- src/accl/util.hh | 7 ++-- 3 files changed, 55 insertions(+), 42 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index da0774ca44..4b78ff9e80 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -28,9 +28,9 @@ Import('*') SimObject('WLEngine.py') -# SimObject('Apply.py') -# SimObject('PushEngine.py') +SimObject('Apply.py') +SimObject('PushEngine.py') -# Source('apply.cc') +Source('apply.cc') Source('wl_engine.cc') -# Source('push_engine.cc') +Source('push_engine.cc') diff --git a/src/accl/util.cc b/src/accl/util.cc index 92f6a3e351..b81ba4db7d 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,8 +28,39 @@ #include "accl/util.hh" -#include "base/types.hh" -#include "mem/packet.hh" +WorkListItem& +memoryToWorkList(uint8_t* data){ + WorkListItem wl; + uint32_t temp_prop = *((uint32_t*) data)); + + uint32_t prop = *((uint32_t*) (data + 4)); + + uint32_t degree = *((uint32_t*) (data + 8)); + + uint32_t addr = *((uint32_t*) (data + 12)); + + retrun wl = {temp_prop, prop, degree, addr}; +} + +uint8_t* +workListToMemory(WorkListItem wl){ + int data_size = sizeof(WorkListItem) / sizeof(uint8_t); + uint8_t* data = new uint8_t [data_size]; + + uint32_t* tempPtr = (uint32_t*) data; + *tempPtr = wl.temp_prop; + + uint32_t* propPtr = (uint32_t*) (data + 4); + *propPtr = wl.prop; + + uint32_t* degreePtr = (uint32_t*) (data + 8); + *degreePtr = wl.degree; + + uint32_t* edgePtr = (uint32_t*) (data + 12); + *edgePtr = wl.edgeIndex; + + return data; +} // Edge: (weight: 64 bits, neighbor: 64 bits) Edge& @@ -58,7 +89,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr +PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -73,19 +104,24 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } - -PacketPtr getWritePacket(Addr 
addr, - unsigned int size, - uint8_t* data, - RequestorID requestorId) +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId) { - equestPtr req = std::make_shared(addr, size, 0, + RequestPtr req = std::make_shared(addr, size, 0, requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits req->setPC(((Addr)requestorId) << 2); PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} -PacketPtr +PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) { RequestPtr req = std::make_shared(addr, size, 0, @@ -102,29 +138,3 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } -WorkListItem& -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); - - uint32_t prop = *((uint32_t*) (data + 4)); - - uint32_t degree = *((uint32_t*) (data + 8)); - - uint32_t addr = *((uint32_t*) (data + 12)); - - retrun wl = {temp_prop, prop, degree, addr}; -} - -unit8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem)/sizeof(uint_8) - uint_8* data = new uint8_t [data_size]; - uint_32* wList = (uint_32*)data; - *wList = wl.prop; - *wList + 1 = wl.temp_prop; - *wList + 2 = wl.degree; - *wList + 3 = wl.edgeIndex; - - return data; -} diff --git a/src/accl/util.hh b/src/accl/util.hh index 737d52e2a1..da5a0736c9 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,7 +26,6 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "base/addr_range_map.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -50,5 +49,9 @@ uint8_t* workListToMemory(WorkListItem wl); Edge& memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); +PacketPtr& getReadPacket(Addr addr, unsigned int size, + RequestorID requestorId); +PacketPtr& +getWritePacket(Addr addr, unsigned int size, + uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); From 9eeb01889c5813d1f60ddfacda5e4c4538460860 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 13:05:49 -0800 Subject: [PATCH 016/287] More bug fixes. --- src/accl/SConscript | 5 +++-- src/accl/apply.cc | 5 +++++ src/accl/apply.hh | 5 +++++ src/accl/push_engine.cc | 6 +++++- src/accl/push_engine.hh | 5 ++++- src/accl/util.hh | 3 +-- src/accl/wl_engine.cc | 4 ++++ src/accl/wl_engine.hh | 11 ++++++++--- 8 files changed, 35 insertions(+), 9 deletions(-) diff --git a/src/accl/SConscript b/src/accl/SConscript index 4b78ff9e80..18ac71eb7d 100644 --- a/src/accl/SConscript +++ b/src/accl/SConscript @@ -27,10 +27,11 @@ Import('*') -SimObject('WLEngine.py') SimObject('Apply.py') SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply.cc') -Source('wl_engine.cc') Source('push_engine.cc') +Source('wl_engine.cc') +Source('util.cc') diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 985e6217d7..678f240bf6 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,9 @@ #include +namespace gem5 +{ + Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), @@ -225,3 +228,5 @@ Apply::processNextApplyEvent(){ schedule(nextApplyEvent, nextCycle()); } } + +} diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f4dabd6a97..42cb310136 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -42,6 +42,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" 
+namespace gem5 +{ + class Apply : public ClockedObject { private: @@ -177,4 +180,6 @@ class Apply : public ClockedObject PortID idx=InvalidPortID) override; }; +} + #endif // __ACCL_APPLY_HH__ diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index f1f8f7698b..57fa560ff7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -28,7 +28,9 @@ #include "accl/util.hh" #include "accl/push_engine.hh" -// #include "debug/PushEngine.hh" + +namespace gem5 +{ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), system(params.system), @@ -243,3 +245,5 @@ void PushEngine::processNextSendEvent() schedule(nextSendEvent, nextCycle()); } } + +} diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 077c61aa2b..cc129076a5 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -39,7 +39,9 @@ #include "sim/clocked_object.hh" #include "sim/system.hh" -//FIXME: Add gem5 namespace here +namespace gem5 +{ + class PushEngine : public ClockedObject { private: @@ -134,4 +136,5 @@ class PushEngine : public ClockedObject }; +} #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/util.hh b/src/accl/util.hh index da5a0736c9..76d67ce6df 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -51,7 +51,6 @@ uint8_t* edgeToMemory(Edge e); PacketPtr& getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& -getWritePacket(Addr addr, unsigned int size, +PacketPtr& getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 757bdd2598..00371e56cc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +namespace gem5 +{ WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), @@ -242,3 +244,5 @@ WLEngine::processNextWLReduceEvent(){ schedule(nextWLReduceEvent, nextCycle()); } } + +} diff 
--git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 0393cd4cb5..8c69bba7f7 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -41,6 +41,9 @@ #include "sim/clocked_object.hh" #include "sim/port.hh" +namespace gem5 +{ + class WLEngine : public ClockedObject { private: @@ -117,7 +120,7 @@ class WLEngine : public ClockedObject public: WLMemPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); void trySendRetry(); bool blocked(){ return _blocked; @@ -132,7 +135,7 @@ class WLEngine : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); //Events @@ -174,4 +177,6 @@ class WLEngine : public ClockedObject PortID idx=InvalidPortID) override; }; -#endif // __ACCL_WLE_HH__ \ No newline at end of file +} + +#endif // __ACCL_WLE_HH__ From 6efe411a7a16cca5b80ce4fdecba591c1f9de67a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 14:18:14 -0800 Subject: [PATCH 017/287] Even more bug fixes. --- src/accl/push_engine.cc | 28 +++++++++++++++++++++++----- src/accl/push_engine.hh | 35 +++++++++++++++++++++-------------- src/accl/util.cc | 24 ++++++++++++++---------- src/accl/util.hh | 18 ++++++++++++------ 4 files changed, 70 insertions(+), 35 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 57fa560ff7..56a57e76ac 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -65,6 +65,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { + //FIXME: This is the current version of our initializer. + // This should be updated in the future. 
WorkListItem vertices [5] = { {0, 0, 3, 0}, // Addr: 0 {0, 0, 1, 3}, // Addr: 16 @@ -109,6 +111,7 @@ PushEngine::PushRespPort::getAddrRanges() bool PushEngine::handleUpdate(PacketPtr pkt) { + //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { // vertexQueue.push(pkt) // vertexQueueLen++; @@ -192,20 +195,19 @@ bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { blockedPacket = pkt; - DPRINTF(MemScheduler, "Setting blocked to true on port %s\n", - this->name()); _blocked = true; } } -void PushEngine::handleMemResp(PacketPtr pkt) +bool PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -230,8 +232,12 @@ void PushEngine::handleMemResp(PacketPtr pkt) if (!nextSendEvent.scheduled() && !updateQueue.empty()) { schedule(nextSendEvent, nextCycle()); } -} + //TODO: Should we always return true? It's the response from the memory + // so maybe yes. We assume the receiving bandwidth of the PushEngine is + // higher than its demand bandwidth + return true; +} void PushEngine::processNextSendEvent() { @@ -246,4 +252,16 @@ void PushEngine::processNextSendEvent() } } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index cc129076a5..7b5f483431 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,10 +54,10 @@ class PushEngine : public ClockedObject PacketPtr blockedPacket; public: - //TODO: Implement this; - PushRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushRespPort(const std::string& name, PushEngine* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); } @@ -65,27 +65,32 @@ class PushEngine : public ClockedObject class PushReqPort : public RequestPort { private: + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + PushReqPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); } class PushMemPort : public RequestPort { private: + PushEngine* owner bool _blocked; PacketPtr blockedPacket; public: - // TODO: Implement this; - PushMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + PushMemPort(const std::string& name, PushEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacktPtr pkt); bool blocked() { return _blocked; } @@ -106,9 +111,9 @@ class PushEngine : public ClockedObject // int vertexQueueSize; // int vertexQueueLen; - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map 
reqValueMap; std::queue memReqQueue; // Infinite queueing? @@ -127,6 +132,8 @@ class PushEngine : public ClockedObject bool handleUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/util.cc b/src/accl/util.cc index b81ba4db7d..40a1fc761b 100644 --- a/src/accl/util.cc +++ b/src/accl/util.cc @@ -28,18 +28,20 @@ #include "accl/util.hh" -WorkListItem& +namespace gem5 +{ + +WorkListItem memoryToWorkList(uint8_t* data){ WorkListItem wl; - uint32_t temp_prop = *((uint32_t*) data)); + uint32_t temp_prop = *((uint32_t*) data); uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - retrun wl = {temp_prop, prop, degree, addr}; + wl = {temp_prop, prop, degree, addr}; + return wl; } uint8_t* @@ -63,7 +65,7 @@ workListToMemory(WorkListItem wl){ } // Edge: (weight: 64 bits, neighbor: 64 bits) -Edge& +Edge memoryToEdge(uint8_t *data) { uint64_t weight = *((uint64_t*) data); @@ -89,7 +91,7 @@ edgeToMemory(Edge e) return data; } -PacketPtr& +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -104,7 +106,7 @@ getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) return pkt; } -PacketPtr& +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId) { @@ -121,8 +123,9 @@ getWritePacket(Addr addr, unsigned int size, return pkt; } -PacketPtr& -getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PacketPtr +getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId) { RequestPtr req = std::make_shared(addr, size, 0, requestorId); @@ -138,3 +141,4 @@ getUpdatePacket(Addr addr, unsigned int size, uint8_t *data) return pkt; } +} diff --git a/src/accl/util.hh b/src/accl/util.hh index 76d67ce6df..91692488a4 100644 --- a/src/accl/util.hh +++ 
b/src/accl/util.hh @@ -28,6 +28,10 @@ #include "base/types.hh" #include "mem/packet.hh" +#include "mem/request.hh" + +namespace gem5 +{ struct WorkListItem { @@ -41,16 +45,18 @@ struct Edge { uint64_t weight; Addr neighbor; -} +}; -WorkListItem& memoryToWorkList(uint8_t* data); +WorkListItem memoryToWorkList(uint8_t* data); uint8_t* workListToMemory(WorkListItem wl); -Edge& memoryToEdge(uint8_t* data); +Edge memoryToEdge(uint8_t* data); uint8_t* edgeToMemory(Edge e); -PacketPtr& getReadPacket(Addr addr, unsigned int size, +PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); -PacketPtr& getWritePacket(Addr addr, unsigned int size, +PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr& getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + +} From fcdcceb33d9d2dc054f8ad021c0e39c8e4bff21e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 15:46:21 -0800 Subject: [PATCH 018/287] Bug fixes, bug fixes everywhere. 
--- src/accl/apply.cc | 12 ++++---- src/accl/apply.hh | 61 ++++++++++++++++++++--------------------- src/accl/push_engine.cc | 8 +++++- src/accl/push_engine.hh | 17 ++++++------ src/accl/wl_engine.hh | 17 +++++------- 5 files changed, 60 insertions(+), 55 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 678f240bf6..c44738d3fa 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -62,14 +62,14 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const +Apply::ApplyRespPort::getAddrRanges() { return owner->getAddrRanges(); } bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWL(pkt)){ + if (!owner->handleWL(pkt)){ return false; } return true; @@ -82,15 +82,17 @@ Apply::ApplyRespPort::trySendRetry() } -virtual bool +bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void -WLEngine::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 42cb310136..788550646a 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -33,14 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" -#include "base/types.hh" +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/Apply.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -58,18 +57,18 @@ class Apply : public ClockedObject return (applyQueue.size() == queueSize); } bool empty(){ - return applyQueue->empty(); + return applyQueue.empty(); } void push(PacketPtr pkt){ - applyQueue->push(pkt); + applyQueue.push(pkt); } void pop(){ - applyQueue->pop(); + applyQueue.pop(); } 
void front(){ - applyQueue->front(); + applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -80,16 +79,17 @@ class Apply : public ClockedObject { private: Apply *owner; + bool _blocked; PacketPtr blockedPacket; public: - ApplyRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyRespPort(const std::string& name, Apply* owner): + ResponsePort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} - virtual AddrRangeList getAddrRanges(); void trySendRetry(); - - protected: + virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); }; @@ -101,12 +101,13 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyReqPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: void recvReqRetry() override; @@ -121,13 +122,14 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: - ApplyReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + ApplyMemPort(const std::string& name, Apply* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked(){ return _blocked;} protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -138,28 +140,24 @@ class Apply : public ClockedObject bool sendPacket(); //one queue for write and one for read a priotizes write over read void readApplyBuffer(); - bool handleMemResp(PacktPtr resp); + bool handleMemResp(PacketPtr resp); void writePushBuffer(); //Events void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyCheckEvent; /* Syncronously checked If there are any 
active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextApplyEvent(); + EventFunctionWrapper nextApplyEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - System* const system; const RequestorID requestorId; @@ -170,13 +168,14 @@ class Apply : public ClockedObject ApplyMemPort memPort; ApplyRespPort respPort; - ApplyRequestPort reqPort; + ApplyReqPort reqPort; std::unordered_map requestOffset; public: Apply(const ApplyParams &apply); - Port &getPort(const std::string &if_name, + + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 56a57e76ac..48f1115042 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->memPort->getAddrRanges(); + owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -264,4 +264,10 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 7b5f483431..d478d14df0 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -31,8 +31,7 @@ #include -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/PushEngine.hh" @@ -60,7 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); - } + }; class PushReqPort : public 
RequestPort { @@ -77,12 +76,12 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; class PushMemPort : public RequestPort { private: - PushEngine* owner + PushEngine* owner; bool _blocked; PacketPtr blockedPacket; @@ -92,10 +91,10 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacktPtr pkt); + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); - } + }; virtual void startup() override; @@ -134,11 +133,13 @@ class PushEngine : public ClockedObject bool handleMemResp(PacketPtr pkt); + AddrRangeList getAddrRanges(); + public: PushEngine(const PushEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8c69bba7f7..6f875adfed 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -33,13 +33,13 @@ #include #include "accl/util.hh" -#include "base/addr_range_map.hh" -#include "base/statistics.hh" +#include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/WLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" +#include "sim/system.hh" namespace gem5 { @@ -140,40 +140,37 @@ class WLEngine : public ClockedObject //Events void processNextWLReadEvent(); + EventFunctionWrapper nextWLReadEvent; /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ void processNextWLReduceEvent(); + EventFunctionWrapper nextWLReduceEvent; /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - void processNextWLReadEvent(); - EventFunctionWrapper nextWLReadEvent; - - void 
processNextWLReduceEvent(); - EventFunctionWrapper nextWLReduceEvent; System* const system; const RequestorID requestorId; + std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; WLMemPort memPort; WLRespPort respPort; - WLRequestPort reqPort; + WLReqPort reqPort; public: WLEngine(const WLEngineParams ¶ms); - Port &getPort(const std::string &if_name, + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From 750510f593e59e776bbfb2906a8b8e082669aa36 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 14 Feb 2022 21:33:53 -0800 Subject: [PATCH 019/287] arch: Debugging worklist engine [wip] Adding some missing virtual functions. Change-Id: I26f6c7d789f4b295bac3bc9b2a80f2cadb45b96f --- src/accl/wl_engine.cc | 26 +++++++++++++++++++++++++- src/accl/wl_engine.hh | 4 ++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 00371e56cc..7515e10167 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -81,6 +81,24 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } +virtual void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +virtual Tick +WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +virtual void +WLEngine::WLRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -137,6 +155,12 @@ WLEngine::getAddrRanges() const return memPort.getAddrRanges(); } +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + memPort.recvFunctional(pkt); +} + bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ @@ -164,7 +188,7 @@ void WLEngine::processNextWLReadEvent(){ PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ - 
queue.pop() + queue.pop(); memPort.sendPacket(memPkt); break; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 6f875adfed..d2b96db203 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,6 +88,9 @@ class WLEngine : public ClockedObject protected: virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class WLReqPort : public RequestPort //To Apply Engine @@ -159,6 +162,7 @@ class WLEngine : public ClockedObject std::unordered_map requestOffset; AddrRangeList getAddrRanges() const; + void recvFunctional(PacketPtr pkt); WLQueue updateQueue; WLQueue responseQueue; From 79429d177df5baef0d3cd4fc33a4db249d66db37 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:38:05 -0800 Subject: [PATCH 020/287] Bug fix. --- src/accl/Apply.py | 1 + src/accl/apply.cc | 6 ++--- src/accl/push_engine.cc | 50 ++++++++++++++++++++++++++++++++--------- src/accl/push_engine.hh | 3 +++ 4 files changed, 46 insertions(+), 14 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 01c627d4c8..58639e880a 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -34,6 +34,7 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' + system = Param.System(Parent.any, "The system object this apply engine is a part of") respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index c44738d3fa..70bc8031c9 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -40,8 +40,8 @@ Apply::Apply(const ApplyParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{processNextApplyEvent; }, name()), - nextApplyCheckEvent([this]{processNextApplyCheckEvent; }, 
name()) + nextApplyEvent([this]{ processNextApplyEvent(); }, name()), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) { applyReadQueue(params.applyQueueSize); applyWriteQueue(params.applyQueueSize); @@ -172,7 +172,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacktPtr pkt) +Apply::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 48f1115042..6ebe34ebd3 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -106,7 +106,7 @@ bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) AddrRangeList PushEngine::PushRespPort::getAddrRanges() { - owner->getAddrRanges(); + return owner->getAddrRanges(); } bool PushEngine::handleUpdate(PacketPtr pkt) @@ -121,7 +121,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt) + vertexQueue.push(pkt); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -130,8 +130,8 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.pop(); - uint8_t *data = updatePkt->getData(); + PacketPtr updatePkt = vertexQueue.front(); + uint8_t *data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); @@ -152,19 +152,19 @@ void PushEngine::processNextReceiveEvent() num_edge_queue.back()++; } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } else { - addr_queue.push(req_addr); - offset_queue.push(req_offset); - num_edge_queue.push(1); + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); } } - for (int index = 0; index < addr_queue.size(); inedx++) { + for (int index = 0; index 
< addr_queue.size(); index++) { PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; @@ -172,6 +172,8 @@ void PushEngine::processNextReceiveEvent() reqValueMap[pkt->req] = value; } + vertexQueue.pop(); + if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); } @@ -264,10 +266,36 @@ PushEngine::PushReqPort::sendPacket(PacketPtr pkt) } } +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { return memPort.getAddrRanges(); } +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index d478d14df0..0acedd0da8 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -59,6 +59,7 @@ class PushEngine : public ClockedObject {} virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + }; class PushReqPort : public RequestPort @@ -76,6 +77,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; class PushMemPort : public RequestPort @@ -94,6 +96,7 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); }; virtual void startup() override; From 228fcf05f87be11a23ee5cfb8dec41d5b8dbcedd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:46:27 -0800 Subject: [PATCH 
021/287] Bug fix. --- src/accl/Apply.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index 58639e880a..d6a4bbe5a9 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class Apply(ClockedObject): From 709a21552623e2f112730512a1652d0436ccce03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Feb 2022 21:47:36 -0800 Subject: [PATCH 022/287] Fixing a bug-fix. --- src/accl/apply.hh | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 788550646a..e1b6d33359 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -88,6 +88,7 @@ class Apply : public ClockedObject _blocked(false), blockedPacket(nullptr) {} + protected: void trySendRetry(); virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); From c1dd68a3e06a498b89cbb043f4779865ecad91b3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 00:13:21 -0800 Subject: [PATCH 023/287] fixing some bugs --- src/accl/Apply.py | 1 + src/accl/WLEngine.py | 4 +++- src/accl/apply.cc | 31 ++++++++++++++++++++---- src/accl/apply.hh | 23 ++++++++++-------- src/accl/wl_engine.cc | 48 +++++++++++++++++++------------------ src/accl/wl_engine.hh | 55 ++++++++++++++++++++++++------------------- 6 files changed, 99 insertions(+), 63 deletions(-) diff --git a/src/accl/Apply.py b/src/accl/Apply.py index d6a4bbe5a9..8720287cc8 100644 --- a/src/accl/Apply.py +++ b/src/accl/Apply.py @@ -38,3 +38,4 @@ class Apply(ClockedObject): respPort = ResponsePort("Receives requests from WorkList") reqPort = RequestPort("Sends requests to Push") memPort = RequestPort("Memory side port, sends requests") + applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git 
a/src/accl/WLEngine.py b/src/accl/WLEngine.py index fe6b25b6ba..562fd04423 100644 --- a/src/accl/WLEngine.py +++ b/src/accl/WLEngine.py @@ -26,7 +26,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. from m5.params import * -from m5.SimObject import SimObject +from m5.proxy import * from m5.objects.ClockedObject import ClockedObject class WLEngine(ClockedObject): @@ -34,6 +34,8 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' + system = Param.System(Parent.any, "The system object this push WorkList is a part of") respPort = ResponsePort("Receives updates") reqPort = RequestPort("Sends requests to Apply") memPort = RequestPort("Memory side port, sends requests") + wlQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 70bc8031c9..410eff5268 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -81,6 +81,23 @@ Apply::ApplyRespPort::trySendRetry() sendRetryReq(); } +void +Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) +{ + panic("Not implemented"); +} + +Tick +Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +Apply::ApplyRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) @@ -139,7 +156,7 @@ Apply::getAddrRanges() const bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else{ queue.push(pkt); @@ -177,7 +194,7 @@ Apply::handleMemResp(PacketPtr pkt) auto queue = applyWriteQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; } else queue.push(pkt); @@ -192,7 +209,7 @@ Apply::handleMemResp(PacketPtr pkt) void Apply::processNextApplyEvent(){ auto queue = applyWriteQueue; - auto pkt = queue.front(); + PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); RequestPtr request = pkt->req; @@ -204,7 +221,11 @@ Apply::processNextApplyEvent(){ if (temp_prop != prop){ if (!memPort.blocked() && !reqPort.blocked()){ //update prop with temp_prop - wl.prop = std::min(prop , temp_prop); + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); @@ -212,7 +233,7 @@ Apply::processNextApplyEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); memPort.sendPacket(writePkt); - applyReqPort.sendPacket(writePkt); + reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); diff --git a/src/accl/apply.hh b/src/accl/apply.hh index e1b6d33359..f08c1fef85 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,12 +63,12 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - void pop(){ - applyQueue.pop(); + PacketPtr pop(){ + return applyQueue->pop(); } - void front(){ - applyQueue.front(); + PacketPtr front(){ + return applyQueue.front(); } ApplyQueue(uint32_t qSize): @@ -83,15 +83,18 @@ class Apply : public ClockedObject PacketPtr blockedPacket; public: + void trySendRetry(); + virtual AddrRangeList getAddrRanges(); ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} protected: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class ApplyReqPort : public RequestPort @@ -137,6 +140,10 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + ApplyMemPort memPort; + ApplyRespPort respPort; + ApplyReqPort reqPort; + bool handleWL(PacketPtr pkt); bool sendPacket(); //one queue for write and one 
for read a priotizes write over read @@ -167,10 +174,6 @@ class Apply : public ClockedObject ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; - ApplyMemPort memPort; - ApplyRespPort respPort; - ApplyReqPort reqPort; - std::unordered_map requestOffset; public: diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 7515e10167..9b16a15575 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -36,6 +36,7 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), + queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), @@ -43,8 +44,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): nextWLReadEvent([this]{processNextWLReadEvent; }, name()), nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) { - updateQueue(params.wlQueueSize); - responseQueue(params.wlQueueSize); + updateQueue.resize(queueSize); + responseQueue.resize(queueSize); } Port & @@ -69,7 +70,7 @@ WLEngine::WLRespPort::getAddrRanges() const bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) { - if (!this->handleWLUpdate(pkt)){ + if (!owner->handleWLUpdate(pkt)){ return false; } return true; @@ -81,19 +82,19 @@ WLEngine::WLRespPort::trySendRetry() sendRetryReq(); } -virtual void +void WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { owner->recvFunctional(pkt); } -virtual Tick +Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { panic("recvAtomic unimpl."); } -virtual void +void WLEngine::WLRespPort::recvRespRetry() { panic("recvRespRetry from response port is called."); @@ -118,10 +119,10 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -virtual bool +bool WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - return this->handleMemResp(pkt); + return owner->handleMemResp(pkt); } void @@ -177,15 +178,14 @@ bool WLEngine::handleWLUpdate(PacketPtr pkt){ void WLEngine::processNextWLReadEvent(){ auto queue = 
updateQueue; - auto memPort = WLMemPort; while (!queue.empty()){ //create a map instead of front - auto pkt = queue.front() + PacketPtr pkt = queue.front(); /// conver to ReadReq Addr req_addr = (pkt->getAddr() / 64) * 64; int req_offset = (pkt->getAddr()) % 64; RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(req, MemCmd::ReadReq); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (!memPort.blocked()){ queue.pop(); @@ -199,15 +199,15 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacktPtr pkt) +WLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ - sendPktRetry = true; + queue.sendPktRetry = true; return false; - } else - queue.push(writePkt); - + } else{ + queue.push(pkt); + } if(!nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } @@ -219,18 +219,20 @@ void WLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - applyPort = reqPort; - auto update = updateQ.front(); - auto value = update->getPtr(); - auto pkt = queue.front(); + auto applyPort = reqPort; + PacketPtr update = updateQ.front(); + uint8_t* value = update->getPtr(); + PacketPtr pkt = queue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset) + WorkListItem wl = memoryToWorkList(data + request_offset); uint32_t temp_prop = wl.temp_prop; if (temp_prop != *value){ //update prop with temp_prop - temp_prop = std::min(value , temp_prop); + if(*value < temp_prop){ + temp_prop = *value; + } if (!memPort.blocked() && !applyPort.blocked()){ wl.temp_prop = temp_prop; uint8_t* wlItem = workListToMemory(wl); @@ -257,7 +259,7 @@ WLEngine::processNextWLReduceEvent(){ memPort.trySendRetry(); queue.sendPktRetry = false; } - updateQ.pop() + updateQ.pop(); if (!updateQ.blocked() & 
updateQ.sendPktRetry){ respPort.trySendRetry(); updateQ.sendPktRetry = false; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index d2b96db203..8d02c16981 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -50,27 +50,32 @@ class WLEngine : public ClockedObject struct WLQueue{ std::queue wlQueue; - const uint32_t queueSize; + uint32_t queueSize; bool sendPktRetry; + void resize(uint32_t size){ + queueSize = size; + } + bool blocked(){ return (wlQueue.size() == queueSize); } bool empty(){ - return wlQueue->empty(); + return wlQueue.empty(); } void push(PacketPtr pkt){ - wlQueue->push(pkt); + wlQueue.push(pkt); } void pop(){ - wlQueue->pop(); + wlQueue.pop(); } - void front(){ - wlQueue.front()); + PacketPtr front(){ + return wlQueue.front(); } WLQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class WLRespPort : public ResponsePort //From Push engine @@ -83,7 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const override; void trySendRetry(); protected: @@ -129,50 +134,52 @@ class WLEngine : public ClockedObject return _blocked; } - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; + protected: + virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; + System* const system; + const uint32_t queueSize; + const RequestorID requestorId; + + WLReqPort reqPort; + WLRespPort respPort; + WLMemPort memPort; + bool handleWLU(PacketPtr pkt); bool sendPacket(); //one queue for write and one for read a priotizes write over read void readWLBuffer(); - bool handleMemResp(PacketPtr resp); //Events - void processNextWLReadEvent(); EventFunctionWrapper nextWLReadEvent; + void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + 
MPU::MPU::MemPortsendTimingReq */ - void processNextWLReduceEvent(); EventFunctionWrapper nextWLReduceEvent; + void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp Perform apply and send the write request and read edgeList read + write Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - std::unordered_map requestOffset; - AddrRangeList getAddrRanges() const; - void recvFunctional(PacketPtr pkt); - WLQueue updateQueue; WLQueue responseQueue; - WLMemPort memPort; - WLRespPort respPort; - WLReqPort reqPort; - public: + public: + AddrRangeList getAddrRanges() const; + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); WLEngine(const WLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; From 90800d55dd30af7e3fb47173bad39c3adf11ccbd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:24:55 -0800 Subject: [PATCH 024/287] Bug fix. 
--- src/accl/push_engine.cc | 26 ++++++++++++++++++++------ src/accl/push_engine.hh | 13 ++++++++++++- src/accl/wl_engine.cc | 9 ++------- src/accl/wl_engine.hh | 3 +-- 4 files changed, 35 insertions(+), 16 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 6ebe34ebd3..746ed8a142 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -98,18 +98,32 @@ PushEngine::startup() } -bool PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) +AddrRangeList +PushEngine::PushRespPort::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +bool +PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) { return owner->handleUpdate(pkt); } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +Tick +PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) { - return owner->getAddrRanges(); + panic("recvAtomic unimpl."); +} + +void +PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); } -bool PushEngine::handleUpdate(PacketPtr pkt) +bool +PushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -131,7 +145,7 @@ bool PushEngine::handleUpdate(PacketPtr pkt) void PushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); - uint8_t *data = updatePkt->getPtr(); + uint8_t* data = updatePkt->getPtr(); // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) uint32_t edge_index = *((uint32_t *)data); diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 0acedd0da8..1aa70c7acb 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -58,8 +58,12 @@ class PushEngine : public ClockedObject _blocked(false), blockedPacket(nullptr) {} virtual AddrRangeList getAddrRanges(); - virtual bool recvTimingReq(PacketPtr pkt); + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); }; class PushReqPort : public RequestPort @@ -76,6 +80,8 @@ class PushEngine : public ClockedObject {} void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -95,6 +101,8 @@ class PushEngine : public ClockedObject void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } + + protected: virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; @@ -138,6 +146,8 @@ class PushEngine : public ClockedObject AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + public: PushEngine(const PushEngineParams ¶ms); @@ -148,4 +158,5 @@ class PushEngine : public ClockedObject }; } + #endif // __ACCL_PUSH_ENGINE_HH__ diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 9b16a15575..bfabed33e9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -76,12 +76,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::trySendRetry() -{ - sendRetryReq(); -} - void 
WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) { @@ -162,7 +156,8 @@ WLEngine::recvFunctional(PacketPtr pkt) memPort.recvFunctional(pkt); } -bool WLEngine::handleWLUpdate(PacketPtr pkt){ +bool +WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 8d02c16981..ad53fd7e7e 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -88,8 +88,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, SimObject* _owner, PortID id=InvalidPortID); - virtual AddrRangeList getAddrRanges() const override; - void trySendRetry(); + virtual AddrRangeList getAddrRanges(); protected: virtual bool recvTimingReq(PacketPtr pkt); From f62d592c1a5a1f7d397e025a6d9f8a8037a17e12 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:36:23 -0800 Subject: [PATCH 025/287] Bug fix. --- src/accl/push_engine.cc | 24 ++++++++++++++++++------ src/accl/push_engine.hh | 13 +++++-------- src/accl/wl_engine.cc | 2 +- 3 files changed, 24 insertions(+), 15 deletions(-) diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index 746ed8a142..bf385818f5 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -122,6 +122,24 @@ PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) owner->recvFunctional(pkt); } +void +PushEngine::PushRespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +AddrRangeList +PushEngine::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +PushEngine::recvFunctional(PacketPtr pkt) +{ + memPort.sendFunctional(pkt); +} + bool PushEngine::handleUpdate(PacketPtr pkt) { @@ -293,12 +311,6 @@ PushEngine::PushReqPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - void PushEngine::PushMemPort::recvReqRetry() { diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 
1aa70c7acb..269170c045 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -131,23 +131,20 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + + bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); - bool handleUpdate(PacketPtr pkt); - - bool handleMemResp(PacketPtr pkt); - - AddrRangeList getAddrRanges(); - - void recvFunctional(PacketPtr pkt); - public: PushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index bfabed33e9..8365e754fc 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -153,7 +153,7 @@ WLEngine::getAddrRanges() const void WLEngine::recvFunctional(PacketPtr pkt) { - memPort.recvFunctional(pkt); + memPort.sendFunctional(pkt); } bool From e4cbf3493f1179d195209bc0aa007c7cda112506 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:50:05 -0800 Subject: [PATCH 026/287] Bug fixes. 
--- src/accl/wl_engine.cc | 16 +++++++++++----- src/accl/wl_engine.hh | 6 +++--- 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 8365e754fc..872f38673e 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -41,11 +41,11 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), - nextWLReadEvent([this]{processNextWLReadEvent; }, name()), - nextWLReduceEvent([this]{processNextWLReduceEvent; }, name()) + nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), + updateQueue(queueSize), + responseQueue(queueSize) { - updateQueue.resize(queueSize); - responseQueue.resize(queueSize); } Port & @@ -88,6 +88,12 @@ WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) panic("recvAtomic unimpl."); } +void +WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + void WLEngine::WLRespPort::recvRespRetry() { @@ -256,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ad53fd7e7e..fe26d22aef 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -172,14 +172,14 @@ class WLEngine : public ClockedObject WLQueue updateQueue; WLQueue responseQueue; - - - public: AddrRangeList getAddrRanges() const; bool handleWLUpdate(PacketPtr pkt); bool handleMemResp(PacketPtr resp); void recvFunctional(PacketPtr pkt); + + public: WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; From b1e3386565a90f3c4170c72688da1e7f01a3ef7f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 09:56:40 -0800 Subject: [PATCH 027/287] 
Bug fix. --- src/accl/push_engine.hh | 5 +---- src/accl/wl_engine.cc | 2 +- src/accl/wl_engine.hh | 28 +++++++++++++--------------- 3 files changed, 15 insertions(+), 20 deletions(-) diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index 269170c045..ea9026ff8f 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -49,13 +49,10 @@ class PushEngine : public ClockedObject { private: PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; public: PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges(); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 872f38673e..98c940a2de 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -262,7 +262,7 @@ WLEngine::processNextWLReduceEvent(){ } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); + respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index fe26d22aef..94ac7c7aff 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -82,12 +82,11 @@ class WLEngine : public ClockedObject { private: WLEngine *owner; - PacketPtr blockedPacket; public: - WLRespPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); - + WLRespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} virtual AddrRangeList getAddrRanges(); protected: @@ -105,12 +104,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLReqPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLReqPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - bool blocked(){ - return _blocked; - } + bool blocked() { 
return _blocked; } protected: void recvReqRetry() override; @@ -125,13 +124,12 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, SimObject* _owner, - PortID id=InvalidPortID); + WLMemPort(const std::string& name, WLEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} void sendPacket(PacketPtr pkt); - void trySendRetry(); - bool blocked(){ - return _blocked; - } + bool blocked() { return _blocked; } protected: virtual bool recvTimingResp(PacketPtr pkt); From 4541367e7f3091feb30a81c403cbdd9d1d1e9b0b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 13:44:39 -0800 Subject: [PATCH 028/287] Bug fix. --- src/accl/apply.cc | 12 ------------ src/accl/apply.hh | 34 +++++++++++++++------------------- src/accl/push_engine.cc | 2 +- src/accl/push_engine.hh | 2 +- src/accl/wl_engine.hh | 2 +- 5 files changed, 18 insertions(+), 34 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 410eff5268..b493d3d152 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -75,12 +75,6 @@ bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - void Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) { @@ -116,12 +110,6 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } } -void -Apply::ApplyMemPort::trySendRetry() -{ - sendRetryResp(); -} - void Apply::ApplyMemPort::recvReqRetry() { diff --git a/src/accl/apply.hh b/src/accl/apply.hh index f08c1fef85..6ab639c552 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -64,7 +64,7 @@ class Apply : public ClockedObject } PacketPtr pop(){ - return applyQueue->pop(); + return applyQueue.pop(); } PacketPtr front(){ @@ -79,16 +79,12 @@ class Apply : public ClockedObject { private: Apply *owner; - bool _blocked; - PacketPtr blockedPacket; public: - void trySendRetry(); - virtual AddrRangeList getAddrRanges(); 
ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) + ResponsePort(name, owner), owner(owner) {} + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); @@ -140,16 +136,24 @@ class Apply : public ClockedObject void recvReqRetry() override; }; + System* const system; + const RequestorID requestorId; + ApplyMemPort memPort; ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyQueue applyReadQueue; + ApplyQueue applyWriteQueue; + + std::unordered_map requestOffset; + bool handleWL(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readApplyBuffer(); + // bool sendPacket(); + // //one queue for write and one for read a priotizes write over read + // void readApplyBuffer(); bool handleMemResp(PacketPtr resp); - void writePushBuffer(); + // void writePushBuffer(); //Events void processNextApplyCheckEvent(); @@ -166,16 +170,8 @@ class Apply : public ClockedObject Write edgelist loc in buffer */ - System* const system; - const RequestorID requestorId; - AddrRangeList getAddrRanges() const; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; - - std::unordered_map requestOffset; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index bf385818f5..fde79a5aa7 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -99,7 +99,7 @@ PushEngine::startup() } AddrRangeList -PushEngine::PushRespPort::getAddrRanges() +PushEngine::PushRespPort::getAddrRanges() const { return owner->getAddrRanges(); } diff --git a/src/accl/push_engine.hh b/src/accl/push_engine.hh index ea9026ff8f..fbb7d6915a 100644 --- a/src/accl/push_engine.hh +++ b/src/accl/push_engine.hh @@ -54,7 +54,7 @@ class PushEngine : public ClockedObject PushRespPort(const std::string& name, PushEngine* owner): ResponsePort(name, owner), owner(owner) {} - 
virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 94ac7c7aff..504b63bc46 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -87,7 +87,7 @@ class WLEngine : public ClockedObject WLRespPort(const std::string& name, WLEngine* owner): ResponsePort(name, owner), owner(owner) {} - virtual AddrRangeList getAddrRanges(); + virtual AddrRangeList getAddrRanges() const; protected: virtual bool recvTimingReq(PacketPtr pkt); From eb31d031f86ed681b6e974aeda16456daf0e67ef Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 15 Feb 2022 17:02:08 -0800 Subject: [PATCH 029/287] Apply engine compiles --- src/accl/apply.cc | 33 +++++++++++++++++++++++++++------ src/accl/apply.hh | 45 ++++++++++++++++++++++----------------------- 2 files changed, 49 insertions(+), 29 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b493d3d152..55288693f3 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -41,10 +41,12 @@ Apply::Apply(const ApplyParams ¶ms): respPort(name() + ".respPort", this), memPort(name() + ".memPort", this), nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + applyReadQueue(params.applyQueueSize), + applyWriteQueue(params.applyQueueSize) { - applyReadQueue(params.applyQueueSize); - applyWriteQueue(params.applyQueueSize); + // applyReadQueue(params.applyQueueSize); + // applyWriteQueue(params.applyQueueSize); } Port & @@ -62,7 +64,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } AddrRangeList -Apply::ApplyRespPort::getAddrRanges() +Apply::ApplyRespPort::getAddrRanges() const { return owner->getAddrRanges(); } @@ -93,6 +95,12 @@ Apply::ApplyRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } 
+void +Apply::ApplyRespPort::trySendRetry() +{ + sendRetryReq(); +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { @@ -118,6 +126,12 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } +void +Apply::ApplyMemPort::trySendRetry() +{ + sendRetryResp(); +} + void Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { @@ -135,6 +149,12 @@ Apply::ApplyReqPort::recvReqRetry() blockedPacket = nullptr; } +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvRespRetry from response port is called."); +} + AddrRangeList Apply::getAddrRanges() const { @@ -158,7 +178,8 @@ bool Apply::handleWL(PacketPtr pkt){ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ - auto pkt = queue.pop(); + PacketPtr pkt = queue.front(); + queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ respPort.trySendRetry(); queue.sendPktRetry = false; @@ -229,7 +250,7 @@ Apply::processNextApplyEvent(){ } } }else{ - queue.pop(); + queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ memPort.trySendRetry(); queue.sendPktRetry = false; diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 6ab639c552..7f17e173c6 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -63,8 +63,8 @@ class Apply : public ClockedObject applyQueue.push(pkt); } - PacketPtr pop(){ - return applyQueue.pop(); + void pop(){ + applyQueue.pop(); } PacketPtr front(){ @@ -72,20 +72,20 @@ class Apply : public ClockedObject } ApplyQueue(uint32_t qSize): - queueSize(qSize){} + queueSize(qSize), + sendPktRetry(false){} }; class ApplyRespPort : public ResponsePort { private: Apply *owner; - public: ApplyRespPort(const std::string& name, Apply* owner): ResponsePort(name, owner), owner(owner) {} virtual AddrRangeList getAddrRanges() const; - + void trySendRetry(); protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); @@ -105,7 +105,6 @@ class Apply : public ClockedObject RequestPort(name, owner), 
owner(owner), _blocked(false), blockedPacket(nullptr) {} - void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } @@ -139,9 +138,24 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyMemPort memPort; - ApplyRespPort respPort; ApplyReqPort reqPort; + ApplyRespPort respPort; + ApplyMemPort memPort; + + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -155,21 +169,6 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); - //Events - void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyCheckEvent; - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - void processNextApplyEvent(); - EventFunctionWrapper nextApplyEvent; - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - AddrRangeList getAddrRanges() const; public: From e3a7f1c1d727c2497e10003d781f404771345a5b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:22:29 -0800 Subject: [PATCH 030/287] Bug fix. Very close to first compilation. 
--- src/accl/apply.cc | 60 +++++++------------ src/accl/apply.hh | 87 ++++++++++++++------------- src/accl/push_engine.cc | 126 ++++++++++++++++++++++------------------ src/accl/util.hh | 14 +++++ src/accl/wl_engine.cc | 22 ++----- src/accl/wl_engine.hh | 3 +- 6 files changed, 153 insertions(+), 159 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 55288693f3..9c3d3f1c3d 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -37,17 +39,14 @@ Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), system(params.system), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize) -{ - // applyReadQueue(params.applyQueueSize); - // applyWriteQueue(params.applyQueueSize); -} + applyWriteQueue(params.applyQueueSize), + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) +{} Port & Apply::getPort(const std::string &if_name, PortID idx) @@ -96,22 +95,8 @@ Apply::ApplyRespPort::recvRespRetry() } void -Apply::ApplyRespPort::trySendRetry() -{ - sendRetryReq(); -} - -bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +Apply::ApplyReqPort::sendPacket(PacketPtr pkt) { - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; @@ -119,30 +104,27 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } void -Apply::ApplyMemPort::recvReqRetry() +Apply::ApplyReqPort::recvReqRetry() { _blocked = 
false; sendPacket(blockedPacket); blockedPacket = nullptr; } -void -Apply::ApplyMemPort::trySendRetry() +bool +Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) { - sendRetryResp(); + panic("recvTimingResp called on reqPort."); } -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) +bool +Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } void -Apply::ApplyReqPort::recvReqRetry() +Apply::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); @@ -179,9 +161,8 @@ void Apply::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); - queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - respPort.trySendRetry(); + // respPort.trySendRetry(); queue.sendPktRetry = false; } // conver to ReadReq @@ -190,7 +171,8 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + memPort.sendPacke:(memPkt); + queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -245,14 +227,14 @@ Apply::processNextApplyEvent(){ reqPort.sendPacket(writePkt); queue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } }else{ queue.applyQueue.pop(); if (queue.sendPktRetry && !queue.blocked()){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } } diff --git a/src/accl/apply.hh b/src/accl/apply.hh index 7f17e173c6..2a16632e22 100644 --- a/src/accl/apply.hh +++ b/src/accl/apply.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -49,31 +48,31 @@ class Apply : public 
ClockedObject private: struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} + std::queue applyQueue; + const uint32_t queueSize; + bool sendPktRetry; + + bool blocked(){ + return (applyQueue.size() == queueSize); + } + bool empty(){ + return applyQueue.empty(); + } + void push(PacketPtr pkt){ + applyQueue.push(pkt); + } + + void pop(){ + applyQueue.pop(); + } + + PacketPtr front(){ + return applyQueue.front(); + } + + ApplyQueue(uint32_t qSize): + queueSize(qSize) + {} }; class ApplyRespPort : public ResponsePort @@ -109,8 +108,8 @@ class Apply : public ClockedObject bool blocked() { return _blocked; } protected: - void recvReqRetry() override; virtual bool recvTimingResp(PacketPtr pkt); + void recvReqRetry() override; }; class ApplyMemPort : public RequestPort @@ -127,7 +126,7 @@ class Apply : public ClockedObject {} void sendPacket(PacketPtr pkt); - void trySendRetry(); + // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -138,25 +137,10 @@ class Apply : public ClockedObject System* const system; const RequestorID requestorId; - ApplyReqPort reqPort; ApplyRespPort respPort; + ApplyReqPort reqPort; ApplyMemPort memPort; - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -169,6 +153,21 @@ class Apply : public ClockedObject bool handleMemResp(PacketPtr resp); // void writePushBuffer(); + //Events + EventFunctionWrapper nextApplyCheckEvent; + void processNextApplyCheckEvent(); + /* Syncronously checked + If there are any active vertecies: + create memory read packets + MPU::MPU::MemPortsendTimingReq + */ + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp + Perform apply and send the write request and read edgeList + read + write + Write edgelist loc in buffer + */ + AddrRangeList getAddrRanges() const; public: diff --git a/src/accl/push_engine.cc b/src/accl/push_engine.cc index fde79a5aa7..125433653b 100644 --- a/src/accl/push_engine.cc +++ b/src/accl/push_engine.cc @@ -26,9 +26,10 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" #include "accl/push_engine.hh" +#include "accl/util.hh" + namespace gem5 { @@ -128,6 +129,68 @@ PushEngine::PushRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } +void +PushEngine::PushReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::PushReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +PushEngine::PushMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + AddrRangeList PushEngine::getAddrRanges() { @@ -224,24 +287,8 @@ void PushEngine::processNextReadEvent() } } -bool PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool PushEngine::handleMemResp(PacketPtr pkt) +bool +PushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -259,7 +306,8 @@ bool PushEngine::handleMemResp(PacketPtr pkt) // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); updateQueue.push(update); } @@ -286,42 +334,4 @@ void PushEngine::processNextSendEvent() } } -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -PushEngine::PushMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - } diff --git a/src/accl/util.hh b/src/accl/util.hh index 91692488a4..b3cff93f15 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -26,6 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" #include "mem/request.hh" @@ -39,12 +40,25 @@ struct WorkListItem uint32_t prop; uint32_t degree; uint32_t edgeIndex; + + std::string to_string() + { + return csprintf( + "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + temp_prop, prop, degree, edgeIndex); + } + }; struct Edge { uint64_t weight; Addr neighbor; + + std::string to_string() + { + return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); + } }; WorkListItem memoryToWorkList(uint8_t* data); diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 98c940a2de..eb883cb19b 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -30,6 +30,8 @@ #include +#include "accl/util.hh" + namespace gem5 { @@ -76,12 +78,6 @@ bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) return true; } -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - Tick WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) { @@ -125,12 +121,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -void -WLEngine::WLMemPort::trySendRetry() -{ - sendRetryResp(); -} - void WLEngine::WLReqPort::recvReqRetry() { @@ -244,12 +234,12 @@ WLEngine::processNextWLReduceEvent(){ applyPort.sendPacket(writePkt); queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = false; } } @@ -257,12 +247,12 @@ WLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - memPort.trySendRetry(); + // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - respPort.trySendRetry(); + // respPort.trySendRetry(); updateQ.sendPktRetry = 
false; } diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index 504b63bc46..ee25154caa 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -32,7 +32,6 @@ #include #include -#include "accl/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -140,8 +139,8 @@ class WLEngine : public ClockedObject const uint32_t queueSize; const RequestorID requestorId; - WLReqPort reqPort; WLRespPort respPort; + WLReqPort reqPort; WLMemPort memPort; bool handleWLU(PacketPtr pkt); From 099f68905a083c566dcb1334b9c1b4fae3c1edcf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:46:20 -0800 Subject: [PATCH 031/287] More bug fixes. --- src/accl/apply.cc | 8 +------- src/accl/util.hh | 3 ++- src/accl/wl_engine.cc | 12 +++++------- src/accl/wl_engine.hh | 18 ++++++------------ 4 files changed, 14 insertions(+), 27 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index 9c3d3f1c3d..b18c990da2 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -131,12 +131,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvRespRetry from response port is called."); -} - AddrRangeList Apply::getAddrRanges() const { @@ -171,7 +165,7 @@ void Apply::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - memPort.sendPacke:(memPkt); + memPort.sendPacket(memPkt); queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ diff --git a/src/accl/util.hh b/src/accl/util.hh index b3cff93f15..a4418a1cb8 100644 --- a/src/accl/util.hh +++ b/src/accl/util.hh @@ -71,6 +71,7 @@ PacketPtr getReadPacket(Addr addr, unsigned int size, RequestorID requestorId); PacketPtr getWritePacket(Addr addr, unsigned int size, uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned 
int size, uint8_t *data); +PacketPtr getUpdatePacket(Addr addr, unsigned int size, + uint8_t *data, RequestorID requestorId); } diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index eb883cb19b..614f34d175 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -38,17 +38,15 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), system(params.system), - queueSize(params.wlQueueSize), requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), + updateQueue(params.wlQueueSize), + responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()), - updateQueue(queueSize), - responseQueue(queueSize) -{ -} + nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) +{} Port & WLEngine::getPort(const std::string &if_name, PortID idx) diff --git a/src/accl/wl_engine.hh b/src/accl/wl_engine.hh index ee25154caa..57cc063880 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/wl_engine.hh @@ -136,26 +136,26 @@ class WLEngine : public ClockedObject }; System* const system; - const uint32_t queueSize; const RequestorID requestorId; WLRespPort respPort; WLReqPort reqPort; WLMemPort memPort; - bool handleWLU(PacketPtr pkt); - bool sendPacket(); - //one queue for write and one for read a priotizes write over read - void readWLBuffer(); + WLQueue updateQueue; + WLQueue responseQueue; + std::unordered_map requestOffset; //Events + bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by 
MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -164,14 +164,8 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - std::unordered_map requestOffset; - - WLQueue updateQueue; - WLQueue responseQueue; - AddrRangeList getAddrRanges() const; - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + void recvFunctional(PacketPtr pkt); public: From 793d75564e15d66b6d8e81f2a75dfd324465eb41 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 15 Feb 2022 17:53:21 -0800 Subject: [PATCH 032/287] Compilation. yeay. --- src/accl/apply.cc | 9 +++++++++ src/accl/wl_engine.cc | 40 +++++++++++++++++++++++----------------- 2 files changed, 32 insertions(+), 17 deletions(-) diff --git a/src/accl/apply.cc b/src/accl/apply.cc index b18c990da2..40002c5264 100644 --- a/src/accl/apply.cc +++ b/src/accl/apply.cc @@ -117,6 +117,15 @@ Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) panic("recvTimingResp called on reqPort."); } +void +Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + bool Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) { diff --git a/src/accl/wl_engine.cc b/src/accl/wl_engine.cc index 614f34d175..d2ecd0d7c9 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/wl_engine.cc @@ -94,17 +94,14 @@ WLEngine::WLRespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLReqPort::recvTimingResp(PacketPtr) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + panic("recvTimingResp called on the request port."); } void -WLEngine::WLMemPort::recvReqRetry() +WLEngine::WLReqPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -113,14 +110,26 @@ WLEngine::WLMemPort::recvReqRetry() blockedPacket = nullptr; } -bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +void +WLEngine::WLReqPort::sendPacket(PacketPtr pkt) { - return owner->handleMemResp(pkt); + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } } void -WLEngine::WLReqPort::recvReqRetry() +WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +{ + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + _blocked = true; + } +} + +void +WLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. assert(_blocked && blockedPacket != nullptr); @@ -129,13 +138,10 @@ WLEngine::WLReqPort::recvReqRetry() blockedPacket = nullptr; } -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) +bool +WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } + return owner->handleMemResp(pkt); } AddrRangeList From 5e05fe3d6caa51cada748e2dc6e2e200c84932c7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Feb 2022 10:31:28 -0800 Subject: [PATCH 033/287] Fixing a typo. --- src/accl/PushEngine.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/PushEngine.py b/src/accl/PushEngine.py index 3215fdaee2..840d8dea1f 100644 --- a/src/accl/PushEngine.py +++ b/src/accl/PushEngine.py @@ -30,7 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class PushEngine(ClockedObject): - type = 'WLEngine' + type = 'PushEngine' cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' From f35e40e74c7b42f5cd3ffc68b89ef2a714f5dab9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 18 Feb 2022 14:08:41 -0800 Subject: [PATCH 034/287] Restructuring the directory. 
--- src/accl/{ => graph/base}/Apply.py | 0 src/accl/{ => graph/base}/PushEngine.py | 0 src/accl/{ => graph/base}/SConscript | 0 src/accl/{ => graph/base}/WLEngine.py | 0 src/accl/{ => graph/base}/apply.cc | 73 +-------------------- src/accl/{ => graph/base}/apply.hh | 44 +------------ src/accl/{ => graph/base}/push_engine.cc | 0 src/accl/{ => graph/base}/push_engine.hh | 0 src/accl/{ => graph/base}/util.cc | 0 src/accl/{ => graph/base}/util.hh | 0 src/accl/{ => graph/base}/wl_engine.cc | 83 +----------------------- src/accl/{ => graph/base}/wl_engine.hh | 49 +------------- src/accl/graph/sega/mpu.hh | 0 13 files changed, 7 insertions(+), 242 deletions(-) rename src/accl/{ => graph/base}/Apply.py (100%) rename src/accl/{ => graph/base}/PushEngine.py (100%) rename src/accl/{ => graph/base}/SConscript (100%) rename src/accl/{ => graph/base}/WLEngine.py (100%) rename src/accl/{ => graph/base}/apply.cc (80%) rename src/accl/{ => graph/base}/apply.hh (78%) rename src/accl/{ => graph/base}/push_engine.cc (100%) rename src/accl/{ => graph/base}/push_engine.hh (100%) rename src/accl/{ => graph/base}/util.cc (100%) rename src/accl/{ => graph/base}/util.hh (100%) rename src/accl/{ => graph/base}/wl_engine.cc (79%) rename src/accl/{ => graph/base}/wl_engine.hh (75%) create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/src/accl/Apply.py b/src/accl/graph/base/Apply.py similarity index 100% rename from src/accl/Apply.py rename to src/accl/graph/base/Apply.py diff --git a/src/accl/PushEngine.py b/src/accl/graph/base/PushEngine.py similarity index 100% rename from src/accl/PushEngine.py rename to src/accl/graph/base/PushEngine.py diff --git a/src/accl/SConscript b/src/accl/graph/base/SConscript similarity index 100% rename from src/accl/SConscript rename to src/accl/graph/base/SConscript diff --git a/src/accl/WLEngine.py b/src/accl/graph/base/WLEngine.py similarity index 100% rename from src/accl/WLEngine.py rename to src/accl/graph/base/WLEngine.py diff --git 
a/src/accl/apply.cc b/src/accl/graph/base/apply.cc similarity index 80% rename from src/accl/apply.cc rename to src/accl/graph/base/apply.cc index 40002c5264..eae9c2fd16 100644 --- a/src/accl/apply.cc +++ b/src/accl/graph/base/apply.cc @@ -30,17 +30,13 @@ #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { Apply::Apply(const ApplyParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -51,72 +47,13 @@ Apply::Apply(const ApplyParams ¶ms): Port & Apply::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -Apply::ApplyRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool Apply::ApplyRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWL(pkt)){ - return false; - } - return true; -} - -void -Apply::ApplyRespPort::recvFunctional(PacketPtr pkt) -{ - panic("Not implemented"); -} - -Tick -Apply::ApplyRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -Apply::ApplyRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -Apply::ApplyReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -Apply::ApplyReqPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -Apply::ApplyReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on reqPort."); -} - void Apply::ApplyMemPort::sendPacket(PacketPtr pkt) { @@ -140,12 
+77,6 @@ Apply::ApplyMemPort::recvReqRetry() blockedPacket = nullptr; } -AddrRangeList -Apply::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - bool Apply::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ diff --git a/src/accl/apply.hh b/src/accl/graph/base/apply.hh similarity index 78% rename from src/accl/apply.hh rename to src/accl/graph/base/apply.hh index 2a16632e22..a3f0ff5aa3 100644 --- a/src/accl/apply.hh +++ b/src/accl/graph/base/apply.hh @@ -46,7 +46,7 @@ namespace gem5 class Apply : public ClockedObject { private: - + //FIXME: Remove queue defenition from here. struct ApplyQueue{ std::queue applyQueue; const uint32_t queueSize; @@ -75,43 +75,6 @@ class Apply : public ClockedObject {} }; - class ApplyRespPort : public ResponsePort - { - private: - Apply *owner; - public: - ApplyRespPort(const std::string& name, Apply* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - void trySendRetry(); - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class ApplyReqPort : public RequestPort - { - private: - Apply *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ApplyReqPort(const std::string& name, Apply* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - class ApplyMemPort : public RequestPort { private: @@ -134,11 +97,8 @@ class Apply : public ClockedObject void recvReqRetry() override; }; - System* const system; const RequestorID requestorId; - ApplyRespPort respPort; - ApplyReqPort reqPort; ApplyMemPort memPort; ApplyQueue applyReadQueue; @@ -168,8 +128,6 @@ class Apply : public ClockedObject Write edgelist 
loc in buffer */ - AddrRangeList getAddrRanges() const; - public: Apply(const ApplyParams &apply); diff --git a/src/accl/push_engine.cc b/src/accl/graph/base/push_engine.cc similarity index 100% rename from src/accl/push_engine.cc rename to src/accl/graph/base/push_engine.cc diff --git a/src/accl/push_engine.hh b/src/accl/graph/base/push_engine.hh similarity index 100% rename from src/accl/push_engine.hh rename to src/accl/graph/base/push_engine.hh diff --git a/src/accl/util.cc b/src/accl/graph/base/util.cc similarity index 100% rename from src/accl/util.cc rename to src/accl/graph/base/util.cc diff --git a/src/accl/util.hh b/src/accl/graph/base/util.hh similarity index 100% rename from src/accl/util.hh rename to src/accl/graph/base/util.hh diff --git a/src/accl/wl_engine.cc b/src/accl/graph/base/wl_engine.cc similarity index 79% rename from src/accl/wl_engine.cc rename to src/accl/graph/base/wl_engine.cc index d2ecd0d7c9..dc8f1dd744 100644 --- a/src/accl/wl_engine.cc +++ b/src/accl/graph/base/wl_engine.cc @@ -26,21 +26,17 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/wl_engine.hh" +#include "accl/graph/base/wl_engine.hh" #include -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -51,74 +47,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): Port & WLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } -AddrRangeList -WLEngine::WLRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool WLEngine::WLRespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleWLUpdate(pkt)){ - return false; - } - return true; -} - -Tick -WLEngine::WLRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::WLRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::WLRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -bool -WLEngine::WLReqPort::recvTimingResp(PacketPtr) -{ - panic("recvTimingResp called on the request port."); -} - -void -WLEngine::WLReqPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. 
- assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -void -WLEngine::WLReqPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - void WLEngine::WLMemPort::sendPacket(PacketPtr pkt) { @@ -144,18 +79,6 @@ WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) return owner->handleMemResp(pkt); } -AddrRangeList -WLEngine::getAddrRanges() const -{ - return memPort.getAddrRanges(); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool WLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; diff --git a/src/accl/wl_engine.hh b/src/accl/graph/base/wl_engine.hh similarity index 75% rename from src/accl/wl_engine.hh rename to src/accl/graph/base/wl_engine.hh index 57cc063880..3654999b70 100644 --- a/src/accl/wl_engine.hh +++ b/src/accl/graph/base/wl_engine.hh @@ -46,7 +46,7 @@ namespace gem5 class WLEngine : public ClockedObject { private: - + //FIXME: Change this struct WLQueue{ std::queue wlQueue; uint32_t queueSize; @@ -77,44 +77,6 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLRespPort : public ResponsePort //From Push engine - { - private: - WLEngine *owner; - - public: - WLRespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class WLReqPort : public RequestPort //To Apply Engine - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - WLReqPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - 
- protected: - void recvReqRetry() override; - virtual bool recvTimingResp(PacketPtr pkt); - }; - class WLMemPort : public RequestPort { private: @@ -135,11 +97,6 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - System* const system; - const RequestorID requestorId; - - WLRespPort respPort; - WLReqPort reqPort; WLMemPort memPort; WLQueue updateQueue; @@ -164,10 +121,6 @@ class WLEngine : public ClockedObject Write edgelist loc in buffer */ - AddrRangeList getAddrRanges() const; - - void recvFunctional(PacketPtr pkt); - public: WLEngine(const WLEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..e69de29bb2 From d02f3824f8f6fc41ae6cff87bfccff497405d78a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 09:59:09 -0800 Subject: [PATCH 035/287] Restructing the classes. --- src/accl/graph/base/Apply.py | 5 +--- src/accl/graph/base/PushEngine.py | 3 -- src/accl/graph/base/WLEngine.py | 5 +--- src/accl/graph/sega/MPU.py | 46 +++++++++++++++++++++++++++++++ 4 files changed, 48 insertions(+), 11 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/Apply.py index 8720287cc8..80aa430139 100644 --- a/src/accl/graph/base/Apply.py +++ b/src/accl/graph/base/Apply.py @@ -34,8 +34,5 @@ class Apply(ClockedObject): cxx_header = "accl/apply.hh" cxx_class = 'gem5::Apply' - system = Param.System(Parent.any, "The system object this apply engine is a part of") - respPort = ResponsePort("Receives requests from WorkList") - reqPort = RequestPort("Sends requests to Push") - memPort = RequestPort("Memory side port, sends requests") applyQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/base/PushEngine.py index 840d8dea1f..7fef165169 100644 --- a/src/accl/graph/base/PushEngine.py 
+++ b/src/accl/graph/base/PushEngine.py @@ -34,7 +34,4 @@ class PushEngine(ClockedObject): cxx_header = "accl/push_engine.hh" cxx_class = 'gem5::PushEngine' - system = Param.System(Parent.any, "The system object this push engine is a part of") - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/base/WLEngine.py index 562fd04423..deaee20935 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/base/WLEngine.py @@ -34,8 +34,5 @@ class WLEngine(ClockedObject): cxx_header = "accl/wl_engine.hh" cxx_class = 'gem5::WLEngine' - system = Param.System(Parent.any, "The system object this push WorkList is a part of") - respPort = ResponsePort("Receives updates") - reqPort = RequestPort("Sends requests to Apply") - memPort = RequestPort("Memory side port, sends requests") wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..b6e136dda5 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,46 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine + +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + workListEngine = Param.WLEngine("WLEngine object to connect to " + "This MPU") + applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + "This MPU") + pushEngine = Param.PushEngine("PushEngine object to connect to " + "This MPU") From bfb12794aa99858bb88afab45640cc27c90bde76 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:12:50 -0800 Subject: [PATCH 036/287] Sperating WLEngine and BaseWLEngine + few changes in BaseApplyEngine --- .../base/{Apply.py => BaseApplyEngine.py} | 0 src/accl/graph/base/BaseWLEngine.py | 38 ++++++++++++++++++ .../base/{apply.cc => base_apply_engine.cc} | 20 +++++----- .../base/{apply.hh => base_apply_engine.hh} | 35 +++++----------- .../base/{wl_engine.cc => base_wl_engine.cc} | 20 +++++----- .../base/{wl_engine.hh => base_wl_engine.hh} | 13 +++--- src/accl/graph/sega/ApplyEngine.py | 40 +++++++++++++++++++ src/accl/graph/{base => sega}/WLEngine.py | 12 +++--- src/accl/graph/sega/apply_engine.cc | 0 src/accl/graph/sega/apply_engine.hh | 0 src/accl/graph/sega/wl_engine.cc | 0 src/accl/graph/sega/wl_engine.hh | 0 12 files changed, 120 insertions(+), 58 deletions(-) rename src/accl/graph/base/{Apply.py => BaseApplyEngine.py} (100%) create mode 100644 src/accl/graph/base/BaseWLEngine.py rename src/accl/graph/base/{apply.cc => base_apply_engine.cc} (91%) rename src/accl/graph/base/{apply.hh => base_apply_engine.hh} (79%) rename src/accl/graph/base/{wl_engine.cc => base_wl_engine.cc} (91%) rename src/accl/graph/base/{wl_engine.hh => base_wl_engine.hh} (93%) create mode 100644 src/accl/graph/sega/ApplyEngine.py rename 
src/accl/graph/{base => sega}/WLEngine.py (84%) create mode 100644 src/accl/graph/sega/apply_engine.cc create mode 100644 src/accl/graph/sega/apply_engine.hh create mode 100644 src/accl/graph/sega/wl_engine.cc create mode 100644 src/accl/graph/sega/wl_engine.hh diff --git a/src/accl/graph/base/Apply.py b/src/accl/graph/base/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/Apply.py rename to src/accl/graph/base/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py new file mode 100644 index 0000000000..7384e876ef --- /dev/null +++ b/src/accl/graph/base/BaseWLEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseWLEngine(ClockedObject): + type = 'BaseWLEngine' + cxx_header = "accl/base_wl_engine.hh" + cxx_class = 'gem5::BaseWLEngine' + + wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/apply.cc b/src/accl/graph/base/base_apply_engine.cc similarity index 91% rename from src/accl/graph/base/apply.cc rename to src/accl/graph/base/base_apply_engine.cc index eae9c2fd16..c88d14a2c2 100644 --- a/src/accl/graph/base/apply.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/apply.hh" +#include "accl/base_apply_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -Apply::Apply(const ApplyParams ¶ms): +BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), @@ -45,7 +45,7 @@ Apply::Apply(const ApplyParams ¶ms): {} Port & -Apply::getPort(const std::string &if_name, PortID idx) +BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ Apply::getPort(const std::string &if_name, PortID idx) } void -Apply::ApplyMemPort::sendPacket(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,20 +64,20 @@ Apply::ApplyMemPort::sendPacket(PacketPtr pkt) } bool -Apply::ApplyMemPort::recvTimingResp(PacketPtr pkt) +BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -Apply::ApplyMemPort::recvReqRetry() +BaseApplyEngine::ApplyMemPort::recvReqRetry() { _blocked = false; sendPacket(blockedPacket); blockedPacket = nullptr; } -bool Apply::handleWL(PacketPtr pkt){ +bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -91,7 +91,7 @@ bool Apply::handleWL(PacketPtr pkt){ return true; } -void Apply::processNextApplyCheckEvent(){ +void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; if (!memPort.blocked()){ PacketPtr pkt = queue.front(); @@ -114,7 +114,7 @@ void Apply::processNextApplyCheckEvent(){ } bool -Apply::handleMemResp(PacketPtr pkt) +BaseApplyEngine::handleMemResp(PacketPtr pkt) { auto queue = applyWriteQueue; @@ -132,7 +132,7 @@ Apply::handleMemResp(PacketPtr pkt) } void -Apply::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent(){ auto queue = applyWriteQueue; PacketPtr pkt = queue.front(); uint8_t* data = 
pkt->getPtr(); diff --git a/src/accl/graph/base/apply.hh b/src/accl/graph/base/base_apply_engine.hh similarity index 79% rename from src/accl/graph/base/apply.hh rename to src/accl/graph/base/base_apply_engine.hh index a3f0ff5aa3..c2d2f26387 100644 --- a/src/accl/graph/base/apply.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,18 +32,16 @@ #include #include -#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/Apply.hh" +#include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { -class Apply : public ClockedObject +class BaseApplyEngine : public ClockedObject { private: //FIXME: Remove queue defenition from here. @@ -75,21 +73,20 @@ class Apply : public ClockedObject {} }; - class ApplyMemPort : public RequestPort + class MemPort : public RequestPort { private: - Apply *owner; + BaseApplyEngine *owner; bool _blocked; PacketPtr blockedPacket; public: - ApplyMemPort(const std::string& name, Apply* owner): + MemPort(const std::string& name, BaseApplyEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} void sendPacket(PacketPtr pkt); - // void trySendRetry(); bool blocked(){ return _blocked;} protected: @@ -99,7 +96,7 @@ class Apply : public ClockedObject const RequestorID requestorId; - ApplyMemPort memPort; + MemPort memPort; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -107,29 +104,15 @@ class Apply : public ClockedObject std::unordered_map requestOffset; bool handleWL(PacketPtr pkt); - // bool sendPacket(); - // //one queue for write and one for read a priotizes write over read - // void readApplyBuffer(); - bool handleMemResp(PacketPtr resp); - // void writePushBuffer(); - - //Events EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + 
MPU::MPU::MemPortsendTimingReq - */ + + bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ public: - Apply(const ApplyParams &apply); + BaseApplyEngine(const ApplyParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc similarity index 91% rename from src/accl/graph/base/wl_engine.cc rename to src/accl/graph/base/base_wl_engine.cc index dc8f1dd744..7261069c17 100644 --- a/src/accl/graph/base/wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/graph/base/wl_engine.hh" +#include "accl/graph/base/base_wl_engine.hh" #include @@ -35,7 +35,7 @@ namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), @@ -45,7 +45,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): {} Port & -WLEngine::getPort(const std::string &if_name, PortID idx) +BaseWLEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "memPort") { return memPort; @@ -55,7 +55,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } void -WLEngine::WLMemPort::sendPacket(PacketPtr pkt) +BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { if (!sendTimingReq(pkt)) { blockedPacket = pkt; @@ -64,7 +64,7 @@ WLEngine::WLMemPort::sendPacket(PacketPtr pkt) } void -WLEngine::WLMemPort::recvReqRetry() +BaseWLEngine::WLMemPort::recvReqRetry() { // We should have a blocked packet if this function is called. 
assert(_blocked && blockedPacket != nullptr); @@ -74,13 +74,13 @@ WLEngine::WLMemPort::recvReqRetry() } bool -WLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) +BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } bool -WLEngine::handleWLUpdate(PacketPtr pkt){ +BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; if (queue.blocked()){ queue.sendPktRetry = true; @@ -94,7 +94,7 @@ WLEngine::handleWLUpdate(PacketPtr pkt){ return true; } -void WLEngine::processNextWLReadEvent(){ +void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; while (!queue.empty()){ //create a map instead of front PacketPtr pkt = queue.front(); @@ -117,7 +117,7 @@ void WLEngine::processNextWLReadEvent(){ } bool -WLEngine::handleMemResp(PacketPtr pkt) +BaseWLEngine::handleMemResp(PacketPtr pkt) { auto queue = responseQueue; if (queue.blocked()){ @@ -134,7 +134,7 @@ WLEngine::handleMemResp(PacketPtr pkt) } void -WLEngine::processNextWLReduceEvent(){ +BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; auto applyPort = reqPort; diff --git a/src/accl/graph/base/wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh similarity index 93% rename from src/accl/graph/base/wl_engine.hh rename to src/accl/graph/base/base_wl_engine.hh index 3654999b70..2095a20f1b 100644 --- a/src/accl/graph/base/wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" -#include "params/WLEngine.hh" +#include "params/BaseWLEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" @@ -43,7 +43,7 @@ namespace gem5 { -class WLEngine : public ClockedObject +class BaseWLEngine : public ClockedObject { private: //FIXME: Change this @@ -77,7 +77,7 @@ class WLEngine : public ClockedObject sendPktRetry(false){} }; - class WLMemPort : public RequestPort + class MemPort : public RequestPort 
{ private: WLEngine *owner; @@ -85,7 +85,7 @@ class WLEngine : public ClockedObject PacketPtr blockedPacket; public: - WLMemPort(const std::string& name, WLEngine* owner): + MemPort(const std::string& name, WLEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -97,8 +97,7 @@ class WLEngine : public ClockedObject void recvReqRetry() override; }; - WLMemPort memPort; - + MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -122,7 +121,7 @@ class WLEngine : public ClockedObject */ public: - WLEngine(const WLEngineParams ¶ms); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py new file mode 100644 index 0000000000..0d03e71e54 --- /dev/null +++ b/src/accl/graph/sega/ApplyEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from build.NULL.python.m5.proxy import Parent +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseApplyEngine import BaseApplyEngine + +class ApplyEngine(BaseApplyEngine): + type = 'ApplyEngine' + cxx_header = "accl/graph/sega/apply_engine.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/base/WLEngine.py b/src/accl/graph/sega/WLEngine.py similarity index 84% rename from src/accl/graph/base/WLEngine.py rename to src/accl/graph/sega/WLEngine.py index deaee20935..a8f3bd20ea 100644 --- a/src/accl/graph/base/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,14 +25,16 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject +# FIXME: update these to correct files +from m5.objects.BaseWLEngine import BaseWLEngine -class WLEngine(ClockedObject): +class WLEngine(BaseWLEngine): type = 'WLEngine' - cxx_header = "accl/wl_engine.hh" - cxx_class = 'gem5::WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::MPU' - wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From bfdec933f77713641144d1a2bd4fa1c4aec53faa Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 11:25:17 -0800 Subject: [PATCH 037/287] Restructuring classes. 
--- src/accl/graph/base/BasePushEngine.py | 37 ++++++ src/accl/graph/base/SConscript | 4 +- .../{push_engine.cc => base_push_engine.cc} | 125 +++++------------- .../{push_engine.hh => base_push_engine.hh} | 66 ++------- src/accl/graph/sega/MPU.py | 2 +- src/accl/graph/{base => sega}/PushEngine.py | 14 +- src/accl/graph/sega/push_engine.cc | 0 src/accl/graph/sega/push_engine.hh | 0 8 files changed, 90 insertions(+), 158 deletions(-) create mode 100644 src/accl/graph/base/BasePushEngine.py rename src/accl/graph/base/{push_engine.cc => base_push_engine.cc} (77%) rename src/accl/graph/base/{push_engine.hh => base_push_engine.hh} (66%) rename src/accl/graph/{base => sega}/PushEngine.py (83%) create mode 100644 src/accl/graph/sega/push_engine.cc create mode 100644 src/accl/graph/sega/push_engine.hh diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py new file mode 100644 index 0000000000..6ed5d25978 --- /dev/null +++ b/src/accl/graph/base/BasePushEngine.py @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BasePushEngine(ClockedObject): + type = 'BasePushEngine' + cxx_header = "accl/graph/base/base_push_engine.hh" + cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 18ac71eb7d..a881fa1e6e 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,10 @@ Import('*') SimObject('Apply.py') -SimObject('PushEngine.py') +SimObject('BasePushEngine.py') SimObject('WLEngine.py') Source('apply.cc') -Source('push_engine.cc') +Source('base_push_engine.cc') Source('wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/push_engine.cc b/src/accl/graph/base/base_push_engine.cc similarity index 77% rename from src/accl/graph/base/push_engine.cc rename to src/accl/graph/base/base_push_engine.cc index 125433653b..9fbc89221f 100644 --- a/src/accl/graph/base/push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -26,18 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/push_engine.hh" +#include "accl/graph/base/base_push_engine.hh" -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), - system(params.system), - requestorId(system->getRequestorId(this)), - reqPort(name() + ".reqPort", this), - respPort(name() + ".respPort", this), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), @@ -50,21 +47,29 @@ PushEngine::PushEngine(const PushEngineParams ¶ms) : ClockedObject(params), } Port & -PushEngine::getPort(const std::string &if_name, PortID idx) +BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "respPort") { - return respPort; - } else if (if_name == "memPort") { + if (if_name == "memPort") { return memPort; } else { return SimObject::getPort(if_name, idx); } } +RequestorID +BasePushEngine::getRequestorId() +{ + return requestorId; +} + +void +BasePushEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void -PushEngine::startup() +BasePushEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. 
@@ -99,75 +104,14 @@ PushEngine::startup() } -AddrRangeList -PushEngine::PushRespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -PushEngine::PushRespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleUpdate(pkt); -} - -Tick -PushEngine::PushRespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -PushEngine::PushRespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -PushEngine::PushRespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -PushEngine::PushReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::PushReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::PushReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - bool -PushEngine::PushMemPort::recvTimingResp(PacketPtr pkt) +BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) { return owner->handleMemResp(pkt); } void -PushEngine::PushMemPort::sendPacket(PacketPtr pkt) +BasePushEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -179,7 +123,7 @@ PushEngine::PushMemPort::sendPacket(PacketPtr pkt) } void -PushEngine::PushMemPort::recvReqRetry() +BasePushEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -191,20 +135,8 @@ PushEngine::PushMemPort::recvReqRetry() } } -AddrRangeList -PushEngine::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -PushEngine::recvFunctional(PacketPtr pkt) -{ - memPort.sendFunctional(pkt); -} - bool -PushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::handleUpdate(PacketPtr pkt) { //FIXME: There should be a check if the queues are full. // if (vertexQueueLen < vertexQueueSize) { @@ -223,7 +155,8 @@ PushEngine::handleUpdate(PacketPtr pkt) return true; } -void PushEngine::processNextReceiveEvent() +void +BasePushEngine::processNextReceiveEvent() { PacketPtr updatePkt = vertexQueue.front(); uint8_t* data = updatePkt->getPtr(); @@ -274,7 +207,8 @@ void PushEngine::processNextReceiveEvent() } } -void PushEngine::processNextReadEvent() +void +BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); if (!memPort.blocked()) { @@ -288,7 +222,7 @@ void PushEngine::processNextReadEvent() } bool -PushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::handleMemResp(PacketPtr pkt) { RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -321,7 +255,8 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -void PushEngine::processNextSendEvent() +void +BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); if (!reqPort.blocked()) { diff --git a/src/accl/graph/base/push_engine.hh b/src/accl/graph/base/base_push_engine.hh similarity index 66% rename from src/accl/graph/base/push_engine.hh rename to src/accl/graph/base/base_push_engine.hh index fbb7d6915a..591f4ab734 100644 --- a/src/accl/graph/base/push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,67 +31,27 @@ #include -#include "base/addr_range.hh" #include 
"mem/port.hh" #include "mem/packet.hh" -#include "params/PushEngine.hh" +#include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" -#include "sim/system.hh" namespace gem5 { -class PushEngine : public ClockedObject +class BasePushEngine : public ClockedObject { private: - class PushRespPort : public ResponsePort + class MemPort : public RequestPort { private: - PushEngine* owner; - - public: - PushRespPort(const std::string& name, PushEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class PushReqPort : public RequestPort - { - private: - PushEngine* owner; + BasePushEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - PushReqPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class PushMemPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - PushMemPort(const std::string& name, PushEngine* owner): + MemPort(const std::string& name, PushEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -106,13 +66,9 @@ class PushEngine : public ClockedObject virtual void startup() override; - System* const system; - const RequestorID requestorId; + RequestorID requestorId; - PushReqPort reqPort; - PushRespPort respPort; - - PushMemPort memPort; + MemPort memPort; std::queue vertexQueue; // int vertexQueueSize; @@ -128,9 +84,6 @@ class PushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - AddrRangeList 
getAddrRanges(); - void recvFunctional(PacketPtr pkt); - bool handleUpdate(PacketPtr pkt); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -144,11 +97,14 @@ class PushEngine : public ClockedObject public: - PushEngine(const PushEngineParams ¶ms); + BasePushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); + }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index b6e136dda5..923c1a2f38 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,7 +28,7 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files + from m5.objects.WLEngine import WLEngine from m5.objects.PushEngine import PushEngine from m5.objects.ApplyEngine import ApplyEngine diff --git a/src/accl/graph/base/PushEngine.py b/src/accl/graph/sega/PushEngine.py similarity index 83% rename from src/accl/graph/base/PushEngine.py rename to src/accl/graph/sega/PushEngine.py index 7fef165169..fa9d921a26 100644 --- a/src/accl/graph/base/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -29,9 +29,13 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class PushEngine(ClockedObject): - type = 'PushEngine' - cxx_header = "accl/push_engine.hh" - cxx_class = 'gem5::PushEngine' +from m5.objects.WLEngine import WLEngine +from m5.objects.PushEngine import PushEngine +from m5.objects.ApplyEngine import ApplyEngine - memPort = RequestPort("Port to communicate with the memory") +class MPU(ClockedObject): + type = 'MPU' + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = 'gem5::MPU' + + mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc new file mode 100644 index 0000000000..e69de29bb2 diff 
--git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh new file mode 100644 index 0000000000..e69de29bb2 From 3f798dfd17a1ec8087fcdd6c904ae1e8777c91c1 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 11:34:28 -0800 Subject: [PATCH 038/287] Adding RequestorID --- src/accl/graph/base/base_apply_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_apply_engine.hh | 3 +++ src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++++ src/accl/graph/base/base_wl_engine.hh | 4 ++++ 4 files changed, 33 insertions(+) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index c88d14a2c2..111ea16f2e 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), @@ -54,6 +55,18 @@ BaseApplyEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseApplyEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseApplyEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index c2d2f26387..3304e58a92 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -116,6 +116,9 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7261069c17..dec37636ba 100644 --- 
a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,6 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), + requestorId(0), memPort(name() + ".memPort", this), updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), @@ -54,6 +55,18 @@ BaseWLEngine::getPort(const std::string &if_name, PortID idx) } } +RequestorID +BaseWLEngine::getRequestorId() +{ + return requestorId; +} + +void +BaseWLEngine::setRequestorId(RequestorID requestorId) +{ + this->requestorId = requestorId; +} + void BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 2095a20f1b..a63d9b1ef7 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -97,6 +97,7 @@ class BaseWLEngine : public ClockedObject void recvReqRetry() override; }; + RequestorID requestorId; MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; @@ -125,6 +126,9 @@ class BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + RequestorID getRequestorId(); + void setRequestorId(RequestorId requestorId); }; } From d8680eeef1505fb937c7e1ddc8f37681669f46e5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 13:01:19 -0800 Subject: [PATCH 039/287] Definining MPU interfaces. 
--- src/accl/graph/base/base_push_engine.cc | 35 +---- src/accl/graph/base/base_push_engine.hh | 24 ---- src/accl/graph/base/base_wl_engine.hh | 1 + src/accl/graph/sega/mpu.cc | 183 ++++++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 134 +++++++++++++++++ src/mem/packet.hh | 3 + 6 files changed, 322 insertions(+), 58 deletions(-) create mode 100644 src/accl/graph/sega/mpu.cc diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 9fbc89221f..c4388cab4b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -49,11 +49,7 @@ BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(par Port & BasePushEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -104,36 +100,7 @@ BasePushEngine::startup() } -bool -BasePushEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} -void -BasePushEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BasePushEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} bool BasePushEngine::handleUpdate(PacketPtr pkt) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 591f4ab734..2265bb32db 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,34 +42,10 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: - - class MemPort : public RequestPort - { - private: - BasePushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, PushEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - virtual void startup() override; RequestorID requestorId; - MemPort memPort; - std::queue vertexQueue; // int vertexQueueSize; // int vertexQueueLen; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index a63d9b1ef7..3a683bb6e4 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -105,6 +105,7 @@ class BaseWLEngine : public ClockedObject std::unordered_map requestOffset; //Events + //FIXME: make handleWLUpdate public bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..c45ad78ef9 --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2020 The Regents of the University of 
California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +void +MPU::startup() +{ + if (((int16_t) applyEngine->getRequestorId) == -1) { + applyEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) pushEngine->getRequestorId) == -1) { + pushEngine->setRequestorId(nextRequestorId++); + } + if (((int16_t) wlEngine->getRequestorId) == -1) { + wlEngine->setRequestorId(nextRequestorId++); + } +} + +AddrRangeList +MPU::MPURespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +MPU::MPURespPort::recvTimingReq(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +Tick +MPU::MPURespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::MPURespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::MPURespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::MPUReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::MPUReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +MPU::MPUMemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +void +MPU::MPUMemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +AddrRangeList +MPU::getAddrRanges() +{ + return memPort.getAddrRanges(); +} + +void +MPU::recvFunctional(PacketPtr pkt) +{ + if (pkt->isUpdateWL()) { + panic("Functional requests should not be made to WL.") + //TODO: Might be a good idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + memPort.recvFuctional(pkt); + } +} + +bool +MPU::handleMemReq(PacketPtr pkt) +{ + return memPort.recvTimingReq(pkt); +} + +void +MPU::handleMemResp(PacketPtr pkt) +{ + //TODO: Implement this; +} + +bool +MPU::recvWLNotif(WorkListItem wl) +{ + return applyEngine->recvWLUpdate(wl); +} + +bool +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); +} + +bool +MPU::recvPushUpdate(PacketPtr pkt) +{ + // TODO: Implement this Mahyar +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index e69de29bb2..bc4ba5d53b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/MPU.hh" +#include "sim/clocked_object.hh" + +class MPU : public ClockedObject +{ + private: + class MPURespPort : public ResponsePort + { + private: + MPU* owner; + + public: + MPURespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class MPUReqPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MPUReqPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class MPUMemPort : public RequestPort + { + private: + MPU* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, MPU* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + virtual void startup(); + + RequestorID nextRequestorId; + + MPURespPort respPort; + MPUReqPort reqPort; + MPUMemPort memPort; + + ApplyEngine* applyEngine; + PushEngine* pushEngine; + WLEngine* wlEngine; + + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr 
pkt); + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool recvWLNotif(WorkListItem wl); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvPushUpdate(PacketPtr pkt); + + public: + + MPU(const MPUParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +} + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 5332ee32a2..44c44d08a6 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -178,6 +178,7 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent + UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -267,6 +268,8 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } + bool isUpdateWL() const {return testCmdAttrib(updateWL);} + Command responseCommand() const { From 1b1bbac7eedbbf1dfc1f8a5d1495227c6a87e789 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 15:46:10 -0800 Subject: [PATCH 040/287] Adding changes to ApplyEngine and WLEngine --- src/accl/graph/base/base_apply_engine.hh | 28 ++++-------------------- src/accl/graph/base/base_wl_engine.hh | 26 +++++----------------- 2 files changed, 9 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 3304e58a92..d603cb2713 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -73,31 +73,8 @@ class BaseApplyEngine : public ClockedObject {} }; - class MemPort : public RequestPort - { - private: - BaseApplyEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseApplyEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - 
bool blocked(){ return _blocked;} - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - const RequestorID requestorId; - MemPort memPort; - ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -106,11 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + public: BaseApplyEngine(const ApplyParams &apply); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3a683bb6e4..0530c64c72 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -77,26 +77,6 @@ class BaseWLEngine : public ClockedObject sendPktRetry(false){} }; - class MemPort : public RequestPort - { - private: - WLEngine *owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, WLEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - void recvReqRetry() override; - }; - RequestorID requestorId; MemPort memPort; WLQueue updateQueue; @@ -113,6 +93,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ + //FIXME: make void bool handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); @@ -121,8 +102,11 @@ class BaseWLEngine : public ClockedObject read + write Write edgelist loc in buffer */ + protected: + virtual void sendMemReq(PacketPtr pkt) = 0; + virtual void sendApplyReq(WorkListItem wl) = 0; - public: + public: 
BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, From 64080f26149dd3295e452b1e842e2fef1ef8613c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 22:39:08 -0800 Subject: [PATCH 041/287] Finished restructured for ApplyE and WLE, pre-compiled --- src/accl/graph/base/BaseApplyEngine.py | 9 +-- src/accl/graph/base/SConscript | 8 +- src/accl/graph/base/base_apply_engine.cc | 94 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 ++-- src/accl/graph/base/base_wl_engine.cc | 78 +++++--------------- src/accl/graph/base/base_wl_engine.hh | 17 ++--- src/accl/graph/sega/SConscript | 37 ++++++++++ src/accl/graph/sega/apply_engine.cc | 48 ++++++++++++ src/accl/graph/sega/apply_engine.hh | 54 ++++++++++++++ src/accl/graph/sega/wl_engine.cc | 50 +++++++++++++ src/accl/graph/sega/wl_engine.hh | 57 ++++++++++++++ 11 files changed, 321 insertions(+), 144 deletions(-) create mode 100644 src/accl/graph/sega/SConscript diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 80aa430139..23fdfbb08a 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -29,10 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class Apply(ClockedObject): - type = 'Apply' - cxx_header = "accl/apply.hh" - cxx_class = 'gem5::Apply' +class BaseApplyEngine(ClockedObject): + type = 'BaseApplyEngine' + cxx_header = "accl/base_apply_engine.hh" + cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index a881fa1e6e..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,11 +27,11 @@ Import('*') -SimObject('Apply.py') +SimObject('BaseApplyEngine.py') SimObject('BasePushEngine.py') 
-SimObject('WLEngine.py') +SimObject('BaseWLEngine.py') -Source('apply.cc') +Source('base_apply_engine.cc') Source('base_push_engine.cc') -Source('wl_engine.cc') +Source('base_wl_engine.cc') Source('util.cc') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 111ea16f2e..805a7649b7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), applyReadQueue(params.applyQueueSize), applyWriteQueue(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), @@ -48,11 +47,7 @@ BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): Port & BaseApplyEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { return SimObject::getPort(if_name, idx); - } } RequestorID @@ -67,29 +62,6 @@ BaseApplyEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseApplyEngine::ApplyMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseApplyEngine::ApplyMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - -void -BaseApplyEngine::ApplyMemPort::recvReqRetry() -{ - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - bool BaseApplyEngine::handleWL(PacketPtr pkt){ auto queue = applyReadQueue; if (queue.blocked()){ @@ -106,19 +78,19 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ void BaseApplyEngine::processNextApplyCheckEvent(){ auto queue = applyReadQueue; - if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - if (queue.sendPktRetry && !queue.blocked()){ - // respPort.trySendRetry(); - queue.sendPktRetry = false; - } - // conver 
to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - memPort.sendPacket(memPkt); + // if (!memPort.blocked()){ + PacketPtr pkt = queue.front(); + // if (queue.sendPktRetry && !queue.blocked()){ + // // respPort.trySendRetry(); + // queue.sendPktRetry = false; + // } + // conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = req_offset; + if (parent.sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -157,26 +129,26 @@ BaseApplyEngine::processNextApplyEvent(){ uint32_t temp_prop = wl.temp_prop; if (temp_prop != prop){ - if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - reqPort.sendPacket(writePkt); + // if (!memPort.blocked() && !reqPort.blocked()){ + //update prop with temp_prop + if(prop < temp_prop){ + wl.prop = prop; + }else{ + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.recvApplyNotif(WorkListItem.prop, + WorkListItem.degree, + WorkListItem.edgeIndex)){ queue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + // memPort.trySendRetry(); + // queue.sendPktRetry = false; } }else{ queue.applyQueue.pop(); diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index d603cb2713..27d906f060 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_BASEAPPLY_HH__ +#define __ACCL_BASEAPPLY_HH__ #include #include @@ -83,13 +83,14 @@ class BaseApplyEngine : public ClockedObject bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - //FIXME: make void - bool handleMemResp(PacketPtr resp); + + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: - virtual void sendMemReq(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const ApplyParams &apply); @@ -103,4 +104,4 @@ class BaseApplyEngine : public ClockedObject } -#endif // __ACCL_APPLY_HH__ +#endif // __BASEACCL_APPLY_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index dec37636ba..4af6f5e326 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), 
updateQueue(params.wlQueueSize), responseQueue(params.wlQueueSize), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), @@ -48,11 +47,7 @@ BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): Port & BaseWLEngine::getPort(const std::string &if_name, PortID idx) { - if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } + return SimObject::getPort(if_name, idx); } RequestorID @@ -67,31 +62,6 @@ BaseWLEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BaseWLEngine::WLMemPort::sendPacket(PacketPtr pkt) -{ - if (!sendTimingReq(pkt)) { - blockedPacket = pkt; - _blocked = true; - } -} - -void -BaseWLEngine::WLMemPort::recvReqRetry() -{ - // We should have a blocked packet if this function is called. - assert(_blocked && blockedPacket != nullptr); - _blocked = false; - sendPacket(blockedPacket); - blockedPacket = nullptr; -} - -bool -BaseWLEngine::WLMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - bool BaseWLEngine::handleWLUpdate(PacketPtr pkt){ auto queue = updateQueue; @@ -109,20 +79,16 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt){ void BaseWLEngine::processNextWLReadEvent(){ auto queue = updateQueue; - while (!queue.empty()){ //create a map instead of front - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (!memPort.blocked()){ - queue.pop(); - memPort.sendPacket(memPkt); - break; - } + PacketPtr pkt = queue.front(); + /// conver to ReadReq + Addr req_addr = (pkt->getAddr() / 64) * 64; + int req_offset = (pkt->getAddr()) % 64; + RequestPtr request = + std::make_shared(req_addr, 64, 0 ,0); + PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); + requestOffset[request] = 
req_offset; + if (parent.sendMemReq()){ + queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ schedule(nextWLReadEvent, nextCycle()); @@ -150,7 +116,6 @@ void BaseWLEngine::processNextWLReduceEvent(){ auto queue = responseQueue; auto updateQ = updateQueue; - auto applyPort = reqPort; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); @@ -164,17 +129,16 @@ BaseWLEngine::processNextWLReduceEvent(){ if(*value < temp_prop){ temp_prop = *value; } - if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - memPort.sendPacket(writePkt); - applyPort.sendPacket(writePkt); + // if (!memPort.blocked() && !applyPort.blocked()){ + wl.temp_prop = temp_prop; + uint8_t* wlItem = workListToMemory(wl); + memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (parent.sendMemReq(writePkt) && + parent.sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); @@ -187,12 +151,10 @@ BaseWLEngine::processNextWLReduceEvent(){ else{ queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ - // memPort.trySendRetry(); queue.sendPktRetry = false; } updateQ.pop(); if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); updateQ.sendPktRetry = false; } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 0530c64c72..1d0f3e33c1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_WLE_HH__ -#define __ACCL_WLE_HH__ +#ifndef __ACCL_BASEWLENGINE_HH__ +#define __ACCL_BASEWLENGINE_HH__ #include #include @@ -78,23 +78,19 @@ class BaseWLEngine : public ClockedObject }; RequestorID requestorId; - MemPort memPort; WLQueue updateQueue; WLQueue responseQueue; std::unordered_map requestOffset; //Events - //FIXME: make handleWLUpdate public - bool handleWLUpdate(PacketPtr pkt); EventFunctionWrapper nextWLReadEvent; void processNextWLReadEvent(); /* Syncronously checked If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - //FIXME: make void - bool handleMemResp(PacketPtr resp); + void handleMemResp(PacketPtr resp); EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -103,8 +99,8 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual void sendMemReq(PacketPtr pkt) = 0; - virtual void sendApplyReq(WorkListItem wl) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool sendWLNotif(WorkListItem wl) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -114,8 +110,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorId requestorId); + bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_WLE_HH__ +#endif // __ACCL_BASEWLENGINE_HH__ diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript new file mode 100644 index 0000000000..79afe3b7d0 --- /dev/null +++ b/src/accl/graph/sega/SConscript @@ -0,0 +1,37 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Import('*') + +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('WLEngine.py') + +Source('apply_engine.cc') +Source('mpu.cc') +Source('push_engine.cc') +Source('wl_engine.cc') diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index e69de29bb2..41a568bd27 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/apply_engine.hh" + +namespace gem5{ + +ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): + BaseApplyEngine(params) +{} + +virtual bool +ApplyEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +virtual bool +ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ + mpu->recvApplyNotif(prop, degree, edgeIndex); + +} + +} \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index e69de29bb2..fd2bca008f 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_APPLY_HH__ +#define __ACCL_APPLY_HH__ + +#include +#include + +#include "accl/graph/base/base_apply_engine.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/ApplyEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" + +namespace gem5 +{ + +class ApplyEngine : public BaseApplyEngine +{ + private: + MPU mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); +} diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e69de29bb2..9608d0cbc4 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" + +#include + +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params) +{} + +virtual bool +WLEngine::sendMemReq(PacketPtr pkt){ + return mpu->handleMemReq(pkt); +} + +// FIXME: handle the case where Apply queue is full +virtual bool +WLEngine::sendWLNotif(WorkListItem wl){ + mpu->recvWLNotif(wl); + return true; +} \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index e69de29bb2..eee6b1f22f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_WLENGINE_HH__ +#define __ACCL_WLENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/port.hh" +#include "mem/packet.hh" +#include "params/WLEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + + +namespace gem5 +{ + +class WLEngine : public BaseWorkListEngine +{ + private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); + virtual bool sendWLNotif(WorkListItem wl); + public: + WLEngine(const WLEngineParams ¶ms); +} \ No newline at end of file From c6ce909250341eed9d6fe814c45eb402dad0d3b7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 20 Feb 2022 23:31:49 -0800 Subject: [PATCH 042/287] Finished restructure for PushEngine. Pre-compile. 
--- src/accl/graph/base/base_push_engine.cc | 30 +++++-------- src/accl/graph/base/base_push_engine.hh | 19 +++++++- src/accl/graph/sega/mpu.cc | 29 +++++++++++-- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.cc | 58 +++++++++++++++++++++++++ src/accl/graph/sega/push_engine.hh | 55 +++++++++++++++++++++++ 6 files changed, 169 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index c4388cab4b..6871154276 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,7 +33,8 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : ClockedObject(params), +BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : + ClockedObject(params), requestorId(0), memPort(name() + ".memPort", this), // vertexQueueSize(params.vertex_queue_size), @@ -103,7 +104,8 @@ BasePushEngine::startup() bool -BasePushEngine::handleUpdate(PacketPtr pkt) +BasePushEngine::recvApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edge_index) { //FIXME: There should be a check if the queues are full. 
// if (vertexQueueLen < vertexQueueSize) { @@ -115,7 +117,7 @@ BasePushEngine::handleUpdate(PacketPtr pkt) // return true; // } // return false; - vertexQueue.push(pkt); + notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); } @@ -125,21 +127,15 @@ BasePushEngine::handleUpdate(PacketPtr pkt) void BasePushEngine::processNextReceiveEvent() { - PacketPtr updatePkt = vertexQueue.front(); - uint8_t* data = updatePkt->getPtr(); - - // data: (edge_index: 32 bits, degree: 32 bits, value: 32 bits) - uint32_t edge_index = *((uint32_t *)data); - uint32_t degree = *((uint32_t *)(data + 4)); - uint32_t value = *((uint32_t *)(data + 8)); + ApplyNotif notif = notifQueue.front(); std::vector addr_queue; std::vector offset_queue; std::vector num_edge_queue; - for (uint32_t index = 0; index < degree; index++) { + for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edge_index + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { @@ -164,10 +160,10 @@ BasePushEngine::processNextReceiveEvent() memReqQueue.push(pkt); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = value; + reqValueMap[pkt->req] = notif.prop; } - vertexQueue.pop(); + notifQueue.pop(); if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { schedule(nextReadEvent, nextCycle()); @@ -178,8 +174,7 @@ void BasePushEngine::processNextReadEvent() { PacketPtr pkt = memReqQueue.front(); - if (!memPort.blocked()) { - memPort.sendPacket(pkt); + if (!sendMemReq(pkt)) { memReqQueue.pop(); } @@ -226,8 +221,7 @@ void BasePushEngine::processNextSendEvent() { PacketPtr pkt = updateQueue.front(); - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); + if 
(!sendPushUpdate(pkt)) { updateQueue.pop(); } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 2265bb32db..63ad3a6652 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -42,11 +42,22 @@ namespace gem5 class BasePushEngine : public ClockedObject { private: + + struct ApplyNotif { + uint32_t prop; + uint32_t degree; + uint32_t edgeIndex; + + ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): + prop(prop), degree(degree), edgeIndex(edge_index) + {} + }; + virtual void startup() override; RequestorID requestorId; - std::queue vertexQueue; + std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -60,7 +71,7 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool handleUpdate(PacketPtr pkt); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); @@ -71,6 +82,10 @@ class BasePushEngine : public ClockedObject EventFunctionWrapper nextSendEvent; void processNextSendEvent(); + protected: + virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendPushUpdate(PacketPtr pkt) = 0; + public: BasePushEngine(const PushEngineParams ¶ms); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c45ad78ef9..09ab23a835 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -161,7 +161,16 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - //TODO: Implement this; + RequestorID requestorId = pkt->requestorId(); + if (applyEngine->getRequestorId() == requestorId) { + applyEngine->handleMemResp(pkt); + } else if (pushEngine->getRequestorId() == requestorId) { + pushEngine->handleMemResp(pkt); + } else if (wlEngine->getRequestorId() == requestorId) { + wlEngine->handleMemResp(pkt); + } else { + panic("Received a response with an unknown 
requestorId."); + } } bool @@ -173,11 +182,25 @@ MPU::recvWLNotif(WorkListItem wl) bool MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return pushEngine->recvApplyUpdate(prop, degree, edgeIndex); + return pushEngine->recvApplyUpdate(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { - // TODO: Implement this Mahyar + Addr addr = pkt->getAddr(); + for (auto addr_range: memPort.getAddrRangeList()) { + if (addr_range.contains(addr)) { + if (!memPort.sendPacket(pkt)) { + return false; + } + return true; + } + } + + if (!reqPort.sendPacket(pkt)) { + return false; + } + return true; + } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index bc4ba5d53b..93d1dd8bb3 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -120,7 +120,7 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); bool recvPushUpdate(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e69de29bb2..e43512c6f4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngine ¶ms) : + BasePushEngine(params), + owner(params.mpu) +{ +} + +Port & +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + return SimObject::getPort(if_name, idx); +} + +bool +PushEngine::sendMemReq(PacketPtr) +{ + return owner->handleMemReq(pkt); +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + return owner->recvPushUpdate(pkt); +} + +} diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e69de29bb2..54ef72d5f9 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" + +namespace gem5 +{ +class PushEngine : public BasePushEngine +{ + private: + MPU* owner; + + protected: + virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt); + + public: + PushEngine(const PushEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +} + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file From 8a2dae86375bd48db32f494343df2fc9d5d35816 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 20 Feb 2022 23:51:02 -0800 Subject: [PATCH 043/287] Debugging. 
--- src/accl/graph/base/base_apply_engine.cc | 31 +++++++++--------------- src/accl/graph/base/base_apply_engine.hh | 13 +++++----- src/accl/graph/base/base_push_engine.hh | 9 ++++--- src/accl/graph/base/base_wl_engine.cc | 6 ++--- src/accl/graph/base/base_wl_engine.hh | 9 ++++--- src/accl/graph/base/util.cc | 2 +- src/accl/graph/sega/ApplyEngine.py | 7 ++---- src/accl/graph/sega/MPU.py | 6 ++--- src/accl/graph/sega/PushEngine.py | 16 +++++------- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/WLEngine.py | 7 ++---- src/accl/graph/sega/apply_engine.cc | 6 ++--- src/accl/graph/sega/apply_engine.hh | 10 +++++--- src/accl/graph/sega/mpu.cc | 11 ++++++--- src/accl/graph/sega/mpu.hh | 5 ++++ src/accl/graph/sega/wl_engine.hh | 9 ++++--- 16 files changed, 75 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 805a7649b7..301f5931bf 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base_apply_engine.hh" +#include "accl/graph/base/base_apply_engine.hh" #include @@ -90,7 +90,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq(memPkt)){ + if (sendMemReq(memPkt)){ queue.pop(); } if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -98,22 +98,13 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ } } -bool +void BaseApplyEngine::handleMemResp(PacketPtr pkt) { - auto queue = applyWriteQueue; - - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); - - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; - return true; + // FIXME: change the event, remove the retry parts + if(!nextApplyEvent.scheduled()){ + schedule(nextApplyEvent, nextCycle()); + } } void @@ -142,10 +133,10 @@ BaseApplyEngine::processNextApplyEvent(){ //Create memory write requests. PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.recvApplyNotif(WorkListItem.prop, - WorkListItem.degree, - WorkListItem.edgeIndex)){ + if (sendMemReq(writePkt) && + recvApplyNotif(wl.prop, + wl.degree, + wl.edgeIndex)){ queue.pop(); // memPort.trySendRetry(); // queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 27d906f060..56b43cfb7b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -26,14 +26,15 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEAPPLY_HH__ -#define __ACCL_BASEAPPLY_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ #include #include #include "mem/packet.hh" #include "mem/port.hh" +#include "mem/request.hh" #include "params/BaseApplyEngine.hh" #include "sim/clocked_object.hh" #include "sim/port.hh" @@ -73,7 +74,7 @@ class BaseApplyEngine : public ClockedObject {} }; - const RequestorID requestorId; + RequestorID requestorId; ApplyQueue applyReadQueue; ApplyQueue applyWriteQueue; @@ -93,15 +94,15 @@ class BaseApplyEngine : public ClockedObject virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: - BaseApplyEngine(const ApplyParams &apply); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __BASEACCL_APPLY_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 63ad3a6652..873cb26b3d 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_PUSH_ENGINE_HH__ -#define __ACCL_PUSH_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ #include #include "mem/port.hh" +#include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" #include "sim/clocked_object.hh" @@ -94,10 +95,10 @@ class BasePushEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); }; } -#endif // __ACCL_PUSH_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 4af6f5e326..b863b38e19 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -87,7 +87,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (parent.sendMemReq()){ + if (sendMemReq()){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -135,8 +135,8 @@ BaseWLEngine::processNextWLReduceEvent(){ memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (parent.sendMemReq(writePkt) && - parent.sendWLNotif(writePkt)) { + if (sendMemReq(writePkt) && + sendWLNotif(writePkt)) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 1d0f3e33c1..3d807d8b06 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -26,12 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_BASEWLENGINE_HH__ -#define __ACCL_BASEWLENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ #include #include +#include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" @@ -109,10 +110,10 @@ class BaseWLEngine : public ClockedObject PortID idx=InvalidPortID) override; RequestorID getRequestorId(); - void setRequestorId(RequestorId requestorId); + void setRequestorId(RequestorID requestorId); bool handleWLUpdate(PacketPtr pkt); }; } -#endif // __ACCL_BASEWLENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 40a1fc761b..0baa374714 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/util.hh" +#include "accl/graph/base/util.hh" namespace gem5 { diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 0d03e71e54..bb43836ff7 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseApplyEngine import BaseApplyEngine class ApplyEngine(BaseApplyEngine): type = 'ApplyEngine' cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent, "MPU object that owns this ApplyEngine") + mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 923c1a2f38..046dfaf4e8 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -29,9 +29,9 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +# from m5.objects.WLEngine import WLEngine +# from m5.objects.PushEngine import PushEngine +# from m5.objects.ApplyEngine import ApplyEngine class MPU(ClockedObject): type = 'MPU' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index fa9d921a26..eb0eed18ab 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,15 +27,11 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BasePushEngine import BasePushEngine -from m5.objects.WLEngine import WLEngine -from m5.objects.PushEngine import PushEngine -from m5.objects.ApplyEngine import ApplyEngine +class PushEngine(BasePushEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - mpu = Param.MPU(Parent, "The MPU object than owns this PushEngine.") + mpu = Param.MPU(Parent.any, "MPU object that 
owns this PushEngine") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 79afe3b7d0..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -29,6 +29,7 @@ Import('*') SimObject('ApplyEngine.py') SimObject('MPU.py') +SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a8f3bd20ea..12fbcf9b4f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -25,16 +25,13 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from build.NULL.python.m5.proxy import Parent from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -# FIXME: update these to correct files from m5.objects.BaseWLEngine import BaseWLEngine class WLEngine(BaseWLEngine): type = 'WLEngine' cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::MPU' + cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent, "MPU object that owns this WLEngine") \ No newline at end of file + mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 41a568bd27..64ae71e290 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -30,16 +30,16 @@ namespace gem5{ -ApplyEngine:ApplyEngine(const BaseApplyEngine ¶ms): +ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): BaseApplyEngine(params) {} -virtual bool +bool ApplyEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } -virtual bool +bool ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh 
index fd2bca008f..855ebbd8b0 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_APPLY_HH__ -#define __ACCL_APPLY_HH__ +#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ #include #include @@ -45,10 +45,14 @@ namespace gem5 class ApplyEngine : public BaseApplyEngine { private: - MPU mpu; + MPU* mpu; protected: virtual bool sendMemReq(PacketPtr pkt); virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: ApplyEngine(const ApplyEngineParams ¶ms); +}; + } + +#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 09ab23a835..27f7c8e314 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -28,16 +28,19 @@ #include "accl/graph/sega/mpu.hh" +namespace gem5 +{ + void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId) == -1) { + if (((int16_t) applyEngine->getRequestorId()) == -1) { applyEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) pushEngine->getRequestorId) == -1) { + if (((int16_t) pushEngine->getRequestorId()) == -1) { pushEngine->setRequestorId(nextRequestorId++); } - if (((int16_t) wlEngine->getRequestorId) == -1) { + if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } } @@ -204,3 +207,5 @@ MPU::recvPushUpdate(PacketPtr pkt) return true; } + +} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 93d1dd8bb3..b37821c200 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -39,6 +39,9 @@ #include "params/MPU.hh" #include "sim/clocked_object.hh" +namespace gem5 +{ + class MPU : public ClockedObject { private: @@ -129,6 +132,8 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; +}; + } 
#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index eee6b1f22f..938128e05f 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_WLENGINE_HH__ -#define __ACCL_WLENGINE_HH__ +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ #include #include @@ -54,4 +54,7 @@ class WLEngine : public BaseWorkListEngine virtual bool sendWLNotif(WorkListItem wl); public: WLEngine(const WLEngineParams ¶ms); -} \ No newline at end of file +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file From c57c564598e55741ed4c33194e7e0c2750efe9c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:40:15 -0800 Subject: [PATCH 044/287] Lots of debugging. --- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 4 +- src/accl/graph/base/base_apply_engine.cc | 105 ++++++++++------------ src/accl/graph/base/base_apply_engine.hh | 40 ++------- src/accl/graph/base/base_push_engine.cc | 45 +--------- src/accl/graph/base/base_push_engine.hh | 10 +-- src/accl/graph/base/base_wl_engine.cc | 6 +- src/accl/graph/base/base_wl_engine.hh | 6 +- src/accl/graph/sega/MPU.py | 6 +- src/accl/graph/sega/apply_engine.cc | 10 ++- src/accl/graph/sega/apply_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 107 +++++++++++++++++++---- src/accl/graph/sega/mpu.hh | 20 ++--- src/accl/graph/sega/push_engine.cc | 11 +-- src/accl/graph/sega/push_engine.hh | 12 ++- src/accl/graph/sega/wl_engine.cc | 19 ++-- src/accl/graph/sega/wl_engine.hh | 13 ++- src/mem/packet.hh | 3 - 19 files changed, 217 insertions(+), 212 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 23fdfbb08a..45d94b3fd2 
100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/base_apply_engine.hh" + cxx_header = "accl/graph/base/base_apply_engine.hh" cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 6ed5d25978..891221c06d 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,4 +34,3 @@ class BasePushEngine(ClockedObject): cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7384e876ef..3ecf030138 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -31,8 +31,8 @@ class BaseWLEngine(ClockedObject): type = 'BaseWLEngine' - cxx_header = "accl/base_wl_engine.hh" + cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - memPort = RequestPort("Memory side port, sends requests") + diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 301f5931bf..731cd5c345 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -38,8 +38,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): ClockedObject(params), requestorId(-1), - applyReadQueue(params.applyQueueSize), - applyWriteQueue(params.applyQueueSize), + queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} @@ -62,14 +61,14 @@ BaseApplyEngine::setRequestorId(RequestorID 
requestorId) this->requestorId = requestorId; } -bool BaseApplyEngine::handleWL(PacketPtr pkt){ - auto queue = applyReadQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } +bool BaseApplyEngine::recvWLNotif(Addr addr){ + // TODO: Investigate the situation where the queue is full. + // if (applyReadQueue.size() == queueSize){ + // // applyReadQueue.sendPktRetry = true; + // return true; + // } else{ + applyReadQueue.push(addr); + // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } @@ -77,78 +76,64 @@ bool BaseApplyEngine::handleWL(PacketPtr pkt){ } void BaseApplyEngine::processNextApplyCheckEvent(){ - auto queue = applyReadQueue; - // if (!memPort.blocked()){ - PacketPtr pkt = queue.front(); - // if (queue.sendPktRetry && !queue.blocked()){ - // // respPort.trySendRetry(); - // queue.sendPktRetry = false; - // } - // conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; + Addr addr = applyReadQueue.front(); + Addr req_addr = (addr / 64) * 64; + int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; if (sendMemReq(memPkt)){ - queue.pop(); + applyReadQueue.pop(); } - if (!queue.empty() && !nextApplyCheckEvent.scheduled()){ + if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } } -void +bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { // FIXME: change the event, remove the retry parts + applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } + return true; } void BaseApplyEngine::processNextApplyEvent(){ - auto queue = applyWriteQueue; - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); + PacketPtr pkt = applyWriteQueue.front(); + uint8_t* data = pkt->getPtr(); - RequestPtr 
request = pkt->req; - int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; + RequestPtr request = pkt->req; + int request_offset = requestOffset[request]; + WorkListItem wl = memoryToWorkList(data + request_offset); + uint32_t prop = wl.prop; + uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop){ - // if (!memPort.blocked() && !reqPort.blocked()){ - //update prop with temp_prop - if(prop < temp_prop){ - wl.prop = prop; - }else{ - wl.prop = temp_prop; - } - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - recvApplyNotif(wl.prop, - wl.degree, - wl.edgeIndex)){ - queue.pop(); - // memPort.trySendRetry(); - // queue.sendPktRetry = false; - } - }else{ - queue.applyQueue.pop(); - if (queue.sendPktRetry && !queue.blocked()){ - // memPort.trySendRetry(); - queue.sendPktRetry = false; - } + if (temp_prop != prop) { + // TODO: instead of min add a Reduce function. + //update prop with temp_prop + if(prop < temp_prop) { + wl.prop = prop; + }else { + wl.prop = temp_prop; + } + //write back the new worklist item to memory + uint8_t* wList = workListToMemory(wl); + memcpy(data + request_offset, wList, sizeof(WorkListItem)); + //Create memory write requests. 
+ PacketPtr writePkt = + getWritePacket(pkt->getAddr(), 64, data, requestorId); + if (sendMemReq(writePkt) && + sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + applyWriteQueue.pop(); } - if(!queue.empty() && !nextApplyEvent.scheduled()){ + }else { + applyWriteQueue.pop(); + } + if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 56b43cfb7b..b7c0db90cb 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,53 +45,24 @@ namespace gem5 class BaseApplyEngine : public ClockedObject { private: - //FIXME: Remove queue defenition from here. - struct ApplyQueue{ - std::queue applyQueue; - const uint32_t queueSize; - bool sendPktRetry; - - bool blocked(){ - return (applyQueue.size() == queueSize); - } - bool empty(){ - return applyQueue.empty(); - } - void push(PacketPtr pkt){ - applyQueue.push(pkt); - } - - void pop(){ - applyQueue.pop(); - } - - PacketPtr front(){ - return applyQueue.front(); - } - - ApplyQueue(uint32_t qSize): - queueSize(qSize) - {} - }; RequestorID requestorId; - ApplyQueue applyReadQueue; - ApplyQueue applyWriteQueue; + std::queue applyReadQueue; + std::queue applyWriteQueue; + int queueSize; std::unordered_map requestOffset; - bool handleWL(PacketPtr pkt); EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - void handleMemResp(PacketPtr resp); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: BaseApplyEngine(const BaseApplyEngineParams &apply); @@ -101,6 +72,9 @@ class BaseApplyEngine : public ClockedObject RequestorID getRequestorId(); void 
setRequestorId(RequestorID requestorId); + + bool recvWLNotif(Addr addr); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6871154276..d93cbdf8da 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -33,10 +33,9 @@ namespace gem5 { -BasePushEngine::BasePushEngine(const BasePushEngine ¶ms) : +BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(0), - memPort(name() + ".memPort", this), + requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -65,44 +64,6 @@ BasePushEngine::setRequestorId(RequestorID requestorId) this->requestorId = requestorId; } -void -BasePushEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, requestorId); - memPort.sendFunctional(pkt); - } - -} - - - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) @@ -135,7 +96,7 @@ BasePushEngine::processNextReceiveEvent() for (uint32_t index = 0; index < notif.degree; index++) { // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 
+ (notif.edge_index + index) * sizeof(Edge); + Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); Addr req_addr = (edge_addr / 64) * 64; Addr req_offset = edge_addr % 64; if (addr_queue.size()) { diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 873cb26b3d..c723932975 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -54,8 +54,6 @@ class BasePushEngine : public ClockedObject {} }; - virtual void startup() override; - RequestorID requestorId; std::queue notifQueue; @@ -72,24 +70,22 @@ class BasePushEngine : public ClockedObject // int updateQueueSize; // int updateQueueLen; - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); EventFunctionWrapper nextReceiveEvent; void processNextReceiveEvent(); EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - bool handleMemResp(PacketPtr pkt); EventFunctionWrapper nextSendEvent; void processNextSendEvent(); protected: - virtual bool sendMemRequest(PacketPtr pkt) = 0; + virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; public: - BasePushEngine(const PushEngineParams ¶ms); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -97,6 +93,8 @@ class BasePushEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool handleMemResp(PacketPtr pkt); }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index b863b38e19..806ab4a6c3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -30,8 +30,6 @@ #include -#include "accl/graph/base/util.hh" - namespace gem5 { @@ -87,7 +85,7 @@ void BaseWLEngine::processNextWLReadEvent(){ std::make_shared(req_addr, 64, 
0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - if (sendMemReq()){ + if (sendMemReq(memPkt)){ queue.pop(); } if(!queue.empty() && !nextWLReadEvent.scheduled()){ @@ -136,7 +134,7 @@ BaseWLEngine::processNextWLReduceEvent(){ PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); if (sendMemReq(writePkt) && - sendWLNotif(writePkt)) { + sendWLNotif(writePkt->getAddr())) { queue.pop(); if (!queue.blocked() && queue.sendPktRetry){ queue.sendPktRetry = false; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3d807d8b06..a2cab4c7e2 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -91,7 +91,7 @@ class BaseWLEngine : public ClockedObject If there are any active vertecies: create memory read packets + MPU::MPU::MemPortsendTimingReq */ - void handleMemResp(PacketPtr resp); + EventFunctionWrapper nextWLReduceEvent; void processNextWLReduceEvent(); /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp @@ -101,7 +101,7 @@ class BaseWLEngine : public ClockedObject */ protected: virtual bool sendMemReq(PacketPtr pkt) = 0; - virtual bool sendWLNotif(WorkListItem wl) = 0; + virtual bool sendWLNotif(Addr addr) = 0; public: BaseWLEngine(const BaseWLEngineParams ¶ms); @@ -111,7 +111,9 @@ class BaseWLEngine : public ClockedObject RequestorID getRequestorId(); void setRequestorId(RequestorID requestorId); + bool handleWLUpdate(PacketPtr pkt); + bool handleMemResp(PacketPtr resp); }; } diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 046dfaf4e8..68cfb3d42d 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,9 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - workListEngine = Param.WLEngine("WLEngine object to connect to " + work_list_engine = Param.WLEngine("WLEngine object to connect to " "This MPU") - 
applyEngine = Param.ApplyEngine("ApplyEngine object to connect to " + apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " "This MPU") - pushEngine = Param.PushEngine("PushEngine object to connect to " + push_engine = Param.PushEngine("PushEngine object to connect to " "This MPU") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 64ae71e290..bc45850041 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,11 +27,13 @@ */ #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5{ -ApplyEngine::ApplyEngine(const BaseApplyEngine ¶ms): - BaseApplyEngine(params) +ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : + BaseApplyEngine(params), + mpu(params.mpu) {} bool @@ -40,9 +42,9 @@ ApplyEngine::sendMemReq(PacketPtr pkt){ } bool -ApplyEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ mpu->recvApplyNotif(prop, degree, edgeIndex); - + return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 855ebbd8b0..17e3280cb5 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,14 +42,21 @@ namespace gem5 { +class MPU; + class ApplyEngine : public BaseApplyEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + public: + ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 27f7c8e314..4824bcd699 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -31,6 +31,31 @@ namespace gem5 { +MPU::MPU(const MPUParams ¶ms): + 
ClockedObject(params), + nextRequestorId(0), + respPort(name() + ".respPort", this), + reqPort(name() + ".reqPort", this), + memPort(name() + ".memPort", this), + applyEngine(params.apply_engine), + pushEngine(params.push_engine), + wlEngine(params.work_list_engine) +{} + +Port& +MPU::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "respPort") { + return respPort; + } else if (if_name == "reqPort") { + return reqPort; + } else if (if_name == "memPort") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + void MPU::startup() { @@ -43,6 +68,37 @@ MPU::startup() if (((int16_t) wlEngine->getRequestorId()) == -1) { wlEngine->setRequestorId(nextRequestorId++); } + + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + memPort.sendFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + memPort.sendFunctional(pkt); + } } AddrRangeList @@ -54,7 +110,7 @@ MPU::MPURespPort::getAddrRanges() const bool MPU::MPURespPort::recvTimingReq(PacketPtr pkt) { - return wlEngine->handleWLUpdate(pkt); + return owner->handleWLUpdate(pkt); } Tick @@ -106,12 +162,6 @@ MPU::MPUReqPort::recvReqRetry() } } -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - return owner->handleMemResp(pkt); -} - void MPU::MPUMemPort::sendPacket(PacketPtr pkt) { @@ -124,6 +174,14 @@ 
MPU::MPUMemPort::sendPacket(PacketPtr pkt) } } +bool +MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + owner->handleMemResp(pkt); + return true; +} + void MPU::MPUMemPort::recvReqRetry() { @@ -146,19 +204,21 @@ MPU::getAddrRanges() void MPU::recvFunctional(PacketPtr pkt) { - if (pkt->isUpdateWL()) { - panic("Functional requests should not be made to WL.") + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); //TODO: Might be a good idea to implement later. // wlEngine->recvFunctional(pkt); } else { - memPort.recvFuctional(pkt); + memPort.sendFunctional(pkt); } } bool MPU::handleMemReq(PacketPtr pkt) { - return memPort.recvTimingReq(pkt); + //TODO: Investigate sending true all the time + memPort.sendPacket(pkt); + return true; } void @@ -177,33 +237,42 @@ MPU::handleMemResp(PacketPtr pkt) } bool -MPU::recvWLNotif(WorkListItem wl) +MPU::handleWLUpdate(PacketPtr pkt) +{ + return wlEngine->handleWLUpdate(pkt); +} + +bool +MPU::recvWLNotif(Addr addr) { - return applyEngine->recvWLUpdate(wl); + return applyEngine->recvWLNotif(addr); } bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - return pushEngine->recvApplyUpdate(prop, degree, edge_index); + return pushEngine->recvApplyNotif(prop, degree, edge_index); } bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRangeList()) { + for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (!memPort.sendPacket(pkt)) { + if (memPort.blocked()) { return false; + } else { + memPort.sendPacket(pkt); + return true; } - return true; } } - if (!reqPort.sendPacket(pkt)) { + if (reqPort.blocked()) { return false; } + reqPort.sendPacket(pkt); return true; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index b37821c200..be5139c0e0 
100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ -#include "accl/graph/base/util.hh" #include "accl/graph/sega/apply_engine.hh" #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" @@ -91,7 +90,7 @@ class MPU : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, MPU* owner): + MPUMemPort(const std::string& name, MPU* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -119,21 +118,22 @@ class MPU : public ClockedObject AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - - bool recvWLNotif(WorkListItem wl); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool recvPushUpdate(PacketPtr pkt); - public: MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleMemReq(PacketPtr pkt); + void handleMemResp(PacketPtr pkt); + + bool handleWLUpdate(PacketPtr pkt); + bool recvWLNotif(Addr addr); + bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); + bool recvPushUpdate(PacketPtr pkt); }; } -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e43512c6f4..922ae32ed2 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,13 +27,14 @@ */ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/mpu.hh" namespace gem5 { -PushEngine::PushEngine(const PushEngine ¶ms) : +PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - owner(params.mpu) + mpu(params.mpu) { } @@ -44,15 +45,15 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } bool 
-PushEngine::sendMemReq(PacketPtr) +PushEngine::sendMemReq(PacketPtr pkt) { - return owner->handleMemReq(pkt); + return mpu->handleMemReq(pkt); } bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return owner->recvPushUpdate(pkt); + return mpu->recvPushUpdate(pkt); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 54ef72d5f9..e4bb83d2bc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,16 +30,20 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" namespace gem5 { + +class MPU; + class PushEngine : public BasePushEngine { private: - MPU* owner; + MPU* mpu; protected: - virtual bool sendMemRequest(PacketPtr pkt); + virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: @@ -48,8 +52,8 @@ class PushEngine : public BasePushEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; -} +}; } -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9608d0cbc4..40ec755969 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,24 +27,25 @@ */ #include "accl/graph/sega/wl_engine.hh" - -#include - +#include "accl/graph/sega/mpu.hh" namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params) + BaseWLEngine(params), + mpu(params.mpu) {} -virtual bool +bool WLEngine::sendMemReq(PacketPtr pkt){ return mpu->handleMemReq(pkt); } // FIXME: handle the case where Apply queue is full -virtual bool -WLEngine::sendWLNotif(WorkListItem wl){ - mpu->recvWLNotif(wl); +bool +WLEngine::sendWLNotif(Addr addr){ + mpu->recvWLNotif(addr); return true; -} \ No newline at end of file +} + +} diff --git a/src/accl/graph/sega/wl_engine.hh 
b/src/accl/graph/sega/wl_engine.hh index 938128e05f..c5f49ff6a2 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,16 +45,23 @@ namespace gem5 { -class WLEngine : public BaseWorkListEngine +// class MPU; + +class WLEngine : public BaseWLEngine { private: + MPU* mpu; + protected: + virtual bool sendMemReq(PacketPtr pkt); - virtual bool sendWLNotif(WorkListItem wl); + virtual bool sendWLNotif(Addr addr); + public: + WLEngine(const WLEngineParams ¶ms); }; } -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 44c44d08a6..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -178,7 +178,6 @@ class MemCmd IsPrint, //!< Print state matching address (for debugging) IsFlush, //!< Flush the address from caches FromCache, //!< Request originated from a caching agent - UpdateWL, // MPU Accelerator NUM_COMMAND_ATTRIBUTES }; @@ -268,8 +267,6 @@ class MemCmd cmd == ReadCleanReq || cmd == ReadSharedReq); } - bool isUpdateWL() const {return testCmdAttrib(updateWL);} - Command responseCommand() const { From 8967f89ddfe20c155706993789344c5eff701d3c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 14:59:31 -0800 Subject: [PATCH 045/287] Style fix. 
--- src/accl/graph/base/BaseApplyEngine.py | 2 +- src/accl/graph/base/BasePushEngine.py | 1 - src/accl/graph/base/BaseWLEngine.py | 1 - 3 files changed, 1 insertion(+), 3 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index 45d94b3fd2..e48b425b01 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -31,7 +31,7 @@ class BaseApplyEngine(ClockedObject): type = 'BaseApplyEngine' - cxx_header = "accl/graph/base/base_apply_engine.hh" + cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 891221c06d..793b0a7c92 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -33,4 +33,3 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 3ecf030138..473fd05313 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -35,4 +35,3 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") - From fa48d321dd41debc82f39646adf23ad780ca05a7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:20:51 -0800 Subject: [PATCH 046/287] Adding PARAMS macro. 
--- src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/base/base_apply_engine.hh | 2 ++ src/accl/graph/base/base_push_engine.hh | 2 ++ src/accl/graph/base/base_wl_engine.hh | 3 +++ src/accl/graph/sega/SConscript | 8 ++++---- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.hh | 2 +- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 9 files changed, 18 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..5e82a44971 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) +SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) +SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index b7c0db90cb..fbcf95c238 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -65,6 +65,8 @@ class BaseApplyEngine : public ClockedObject virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: + PARAMS(BaseApplyEngine); + BaseApplyEngine(const BaseApplyEngineParams &apply); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index c723932975..446f6a1186 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -85,6 +85,8 @@ class BasePushEngine : public ClockedObject public: + PARAMS(BasePushEngine); + BasePushEngine(const BasePushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a2cab4c7e2..4cb492914c 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -104,6 +104,9 @@ class BaseWLEngine : public ClockedObject virtual bool sendWLNotif(Addr addr) = 0; public: + + PARAMS(BaseWLEngine); + BaseWLEngine(const BaseWLEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..793dacc2ef 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('MPU.py') -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) +SimObject('MPU.py', sim_objects=["MPU"]) +SimObject('PushEngine.py', sim_objects=["PushEngine"]) +SimObject('WLEngine.py', sim_objects=["WLEngine"]) Source('apply_engine.cc') Source('mpu.cc') diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 17e3280cb5..c7d3073e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -56,7 +56,7 @@ class ApplyEngine : public BaseApplyEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: - + PARAMS(ApplyEngine); ApplyEngine(const ApplyEngineParams ¶ms); }; diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index be5139c0e0..cf241c9063 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -119,7 +119,7 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt); public: - + PARAMS(MPU); MPU(const MPUParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e4bb83d2bc..1a800e58f3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -47,6 +47,7 @@ class PushEngine : public 
BasePushEngine virtual bool sendPushUpdate(PacketPtr pkt); public: + PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); Port& getPort(const std::string &if_name, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c5f49ff6a2..238ffbe724 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -59,7 +59,7 @@ class WLEngine : public BaseWLEngine virtual bool sendWLNotif(Addr addr); public: - + PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); }; From 9a5245c317917f60daf0eb400260ec5b11304f26 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 21 Feb 2022 15:33:13 -0800 Subject: [PATCH 047/287] First compilation after restructure. --- src/accl/graph/base/BaseApplyEngine.py | 1 + src/accl/graph/base/BasePushEngine.py | 1 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/SConscript | 6 +++--- src/accl/graph/sega/SConscript | 8 ++++---- 5 files changed, 10 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index e48b425b01..fdabefc732 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseApplyEngine(ClockedObject): + abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index 793b0a7c92..d30124a6a4 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BasePushEngine(ClockedObject): + abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 
473fd05313..7dcacefd97 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -30,6 +30,7 @@ from m5.objects.ClockedObject import ClockedObject class BaseWLEngine(ClockedObject): + abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 5e82a44971..cc55100064 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,9 @@ Import('*') -SimObject('BaseApplyEngine.py', sim_objects=["BaseApplyEngine"]) -SimObject('BasePushEngine.py', sim_objects=["BasePushEngine"]) -SimObject('BaseWLEngine.py', sim_objects=["BaseWLEngine"]) +SimObject('BaseApplyEngine.py') +SimObject('BasePushEngine.py') +SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') Source('base_push_engine.cc') diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 793dacc2ef..dc19ece06b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,10 @@ Import('*') -SimObject('ApplyEngine.py', sim_objects=["ApplyEngine"]) -SimObject('MPU.py', sim_objects=["MPU"]) -SimObject('PushEngine.py', sim_objects=["PushEngine"]) -SimObject('WLEngine.py', sim_objects=["WLEngine"]) +SimObject('ApplyEngine.py') +SimObject('MPU.py') +SimObject('PushEngine.py') +SimObject('WLEngine.py') Source('apply_engine.cc') Source('mpu.cc') From c3b4c743d4953d3648fca7dd384e0f8ed33006f2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 07:38:41 -0800 Subject: [PATCH 048/287] Adding config file for SEGA and missing ports. 
--- configs/accl/sega.py | 34 ++++++++++++++++++++++++++++++++++ src/accl/graph/sega/MPU.py | 10 +++++++--- 2 files changed, 41 insertions(+), 3 deletions(-) create mode 100644 configs/accl/sega.py diff --git a/configs/accl/sega.py b/configs/accl/sega.py new file mode 100644 index 0000000000..288b1211e4 --- /dev/null +++ b/configs/accl/sega.py @@ -0,0 +1,34 @@ +import m5 +from m5.objects import * + +class PyMPU(MPU): + def __init__(self, clk_domain): + super().__init__() + self.clk_domain = clk_domain + self.apply_engine = ApplyEngine() + self.push_engine = PushEngine() + self.wl_engine = WLEngine() + +class SEGA(System): + + def __init__(self): + super(SEGA, self).__init__() + # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + + self.mpu = PyMPU(self.clk_domain) + self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mpu.memPort = self.mem_ctrl.port + self.mpu.reqPort = self.mpu.respPort + + +system = SEGA() +root = Root(full_system = False, system = system) + +m5.instantiate() + +exit_event = m5.simulate() +print("Simulation finished!") +exit() \ No newline at end of file diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 68cfb3d42d..efd8dbc11f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,9 +38,13 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - work_list_engine = Param.WLEngine("WLEngine object to connect to " + apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " "This MPU") - apply_engine = Param.ApplyEngine("ApplyEngine object to connect to " + push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - push_engine = Param.PushEngine("PushEngine object to connect to " + work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " "This MPU") + + respPort = 
ResponsePort("Port to Receive updates from outside") + reqPort = RequestPort("Port to send updates to the outside") + memPort = RequestPort("Port to communicate with the memory") From 7be5866c0171399e8d5ef6851290dd61e7ef6fc9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 12:22:14 -0800 Subject: [PATCH 049/287] Adding BaseEngine class and started pointer fix. --- src/accl/graph/base/BaseApplyEngine.py | 4 +- src/accl/graph/base/BaseEngine.py | 38 ++++++++++ src/accl/graph/base/BasePushEngine.py | 2 + src/accl/graph/base/BaseWLEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 22 +----- src/accl/graph/base/base_apply_engine.hh | 9 +-- src/accl/graph/base/base_engine.cc | 75 ++++++++++++++++++++ src/accl/graph/base/base_engine.hh | 90 ++++++++++++++++++++++++ src/accl/graph/sega/ApplyEngine.py | 2 +- 9 files changed, 213 insertions(+), 30 deletions(-) create mode 100644 src/accl/graph/base/BaseEngine.py create mode 100644 src/accl/graph/base/base_engine.cc create mode 100644 src/accl/graph/base/base_engine.hh diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index fdabefc732..be849ed1af 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseApplyEngine(ClockedObject): +class BaseApplyEngine(BaseEngine): abstract = True type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py new file mode 100644 index 0000000000..3eb5f0cbbc --- /dev/null +++ b/src/accl/graph/base/BaseEngine.py @@ -0,0 +1,38 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseEngine(ClockedObject): + abstract = True + type = 'BaseEngine' + cxx_header = "accl/graph/base/base_engine.hh" + cxx_class = 'gem5::BaseEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index d30124a6a4..c52a65abf9 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -34,3 +34,5 @@ class BasePushEngine(ClockedObject): type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' + + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index 7dcacefd97..ec34b52005 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -36,3 +36,4 @@ class BaseWLEngine(ClockedObject): cxx_class = 'gem5::BaseWLEngine' wlQueueSize = Param.Unsigned(32, "Size of write queue") + memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 731cd5c345..4fd53fb037 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -36,31 +36,12 @@ namespace gem5 { BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), + BaseEngine(params), queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -Port & -BaseApplyEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseApplyEngine::getRequestorId() -{ - return requestorId; -} - -void -BaseApplyEngine::setRequestorId(RequestorID 
requestorId) -{ - this->requestorId = requestorId; -} - bool BaseApplyEngine::recvWLNotif(Addr addr){ // TODO: Investigate the situation where the queue is full. // if (applyReadQueue.size() == queueSize){ @@ -82,6 +63,7 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; + // FIXME: sendMemReq returns void, use memPortBlocked to check instead. if (sendMemReq(memPkt)){ applyReadQueue.pop(); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index fbcf95c238..f81f23428e 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "mem/request.hh" @@ -42,12 +43,10 @@ namespace gem5 { -class BaseApplyEngine : public ClockedObject +class BaseApplyEngine : public BaseEngine { private: - RequestorID requestorId; - std::queue applyReadQueue; std::queue applyWriteQueue; int queueSize; @@ -61,7 +60,6 @@ class BaseApplyEngine : public ClockedObject void processNextApplyEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -72,9 +70,6 @@ class BaseApplyEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvWLNotif(Addr addr); bool handleMemResp(PacketPtr resp); }; diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc new file mode 100644 index 0000000000..d53e2e683a --- /dev/null +++ b/src/accl/graph/base/base_engine.cc @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. 
+ * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/base_engine.hh" + +namespace gem5 +{ + +BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : + ClockedObject(params), + system(params.system), + requestorId(system->getRequestorId()), + memPort(name() + ".memPort", this) +{} + + +void +BaseEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); + +} + +void +BaseEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh new file mode 100644 index 0000000000..f9f500e118 --- /dev/null +++ b/src/accl/graph/base/base_engine.hh @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ + +#include +#include + +#include "mem/packet.hh" +#include "mem/port.hh" +#include "mem/request.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/port.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + System* system; + const RequestorID requestorId; + MemPort memPort; + + protected: + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + virtual bool handleMemResp(PacketPtr resp) = 0; + + public: + PARAMS(BaseEngine); + + BaseEngine(const BaseEngineParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index bb43836ff7..5bb0dc0c25 100644 --- 
a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -34,4 +34,4 @@ class ApplyEngine(BaseApplyEngine): cxx_header = "accl/graph/sega/apply_engine.hh" cxx_class = 'gem5::ApplyEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this ApplyEngine") + push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") From 1bf60b6fa044f8913814d4234e4a209f6076fa1d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Feb 2022 21:44:29 -0800 Subject: [PATCH 050/287] Cont. fixing pointer issue. --- src/accl/graph/base/BaseApplyEngine.py | 2 - src/accl/graph/base/BaseWLEngine.py | 7 +-- src/accl/graph/base/base_apply_engine.cc | 53 ++++++++-------- src/accl/graph/base/base_apply_engine.hh | 4 +- src/accl/graph/base/base_wl_engine.cc | 79 +++++++----------------- src/accl/graph/base/base_wl_engine.hh | 52 +++------------- 6 files changed, 63 insertions(+), 134 deletions(-) diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/BaseApplyEngine.py index be849ed1af..9b240581ac 100644 --- a/src/accl/graph/base/BaseApplyEngine.py +++ b/src/accl/graph/base/BaseApplyEngine.py @@ -34,5 +34,3 @@ class BaseApplyEngine(BaseEngine): type = 'BaseApplyEngine' cxx_header = 'accl/graph/base/base_apply_engine.hh' cxx_class = 'gem5::BaseApplyEngine' - - applyQueueSize = Param.Unsigned(32, "Size of write queue") diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/BaseWLEngine.py index ec34b52005..7311c396b3 100644 --- a/src/accl/graph/base/BaseWLEngine.py +++ b/src/accl/graph/base/BaseWLEngine.py @@ -27,13 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BaseWLEngine(ClockedObject): +class BaseWLEngine(BaseEngine): abstract = True type = 'BaseWLEngine' cxx_header = "accl/graph/base/base_wl_engine.hh" cxx_class = 'gem5::BaseWLEngine' - - wlQueueSize = Param.Unsigned(32, "Size of write 
queue") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 4fd53fb037..7f6c32cf39 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,34 +37,35 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - queueSize(params.applyQueueSize), nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} -bool BaseApplyEngine::recvWLNotif(Addr addr){ +bool +BaseApplyEngine::recvWLNotif(Addr addr) +{ // TODO: Investigate the situation where the queue is full. - // if (applyReadQueue.size() == queueSize){ - // // applyReadQueue.sendPktRetry = true; - // return true; - // } else{ applyReadQueue.push(addr); - // } if (!nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); } return true; } -void BaseApplyEngine::processNextApplyCheckEvent(){ +void +BaseApplyEngine::processNextApplyCheckEvent() +{ + // TODO: We might want to change the way this function + // pops items off queue, maybe we should pop every n cycles + // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; int req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; - // FIXME: sendMemReq returns void, use memPortBlocked to check instead. 
- if (sendMemReq(memPkt)){ + if (!memPortBlocked()) { + sendMemReq(memPkt); applyReadQueue.pop(); } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ @@ -75,7 +76,6 @@ void BaseApplyEngine::processNextApplyCheckEvent(){ bool BaseApplyEngine::handleMemResp(PacketPtr pkt) { - // FIXME: change the event, remove the retry parts applyWriteQueue.push(pkt); if(!nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); @@ -84,38 +84,39 @@ BaseApplyEngine::handleMemResp(PacketPtr pkt) } void -BaseApplyEngine::processNextApplyEvent(){ +BaseApplyEngine::processNextApplyEvent() +{ PacketPtr pkt = applyWriteQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; int request_offset = requestOffset[request]; - WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t prop = wl.prop; - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != prop) { + WorkListItem wl = memoryToWorkList(data + request_offset); + // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem + // to applyengine if temp_prop < prop. If temp_prop has not changed, why + // fwd it to applyengine? + if (wl.temp_prop < wl.prop) { // TODO: instead of min add a Reduce function. //update prop with temp_prop - if(prop < temp_prop) { - wl.prop = prop; - }else { - wl.prop = temp_prop; - } + wl.prop = wl.temp_prop; //write back the new worklist item to memory uint8_t* wList = workListToMemory(wl); memcpy(data + request_offset, wList, sizeof(WorkListItem)); //Create memory write requests. 
PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - applyWriteQueue.pop(); + + if (!memPortBlocked()) { + if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { + sendMemReq(writePkt); + applyWriteQueue.pop(); + } } - }else { + } else { applyWriteQueue.pop(); } - if(!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ + if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index f81f23428e..dc7188ab56 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -37,7 +37,6 @@ #include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" namespace gem5 @@ -60,6 +59,7 @@ class BaseApplyEngine : public BaseEngine void processNextApplyEvent(); protected: + virtual bool handleMemResp(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; public: @@ -71,7 +71,7 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - bool handleMemResp(PacketPtr resp); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 806ab4a6c3..aab39fb7a3 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -34,61 +34,37 @@ namespace gem5 { BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - ClockedObject(params), - requestorId(-1), - updateQueue(params.wlQueueSize), - responseQueue(params.wlQueueSize), + BaseEngine(params), nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) {} -Port & -BaseWLEngine::getPort(const std::string &if_name, PortID 
idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BaseWLEngine::getRequestorId() +bool +BaseWLEngine::handleWLUpdate(PacketPtr pkt) { - return requestorId; + updateQueue.push(pkt); + if(!nextWLReadEvent.scheduled()) { + schedule(nextWLReadEvent, nextCycle()); + } + return true; } -void -BaseWLEngine::setRequestorId(RequestorID requestorId) +void BaseWLEngine::processNextWLReadEvent() { - this->requestorId = requestorId; -} + PacketPtr pkt = updateQueue.front(); -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt){ - auto queue = updateQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else - queue.push(pkt); + Addr addr = pkt->getAddr(); + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - if(!nextWLReadEvent.scheduled()){ - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[request] = req_offset; -void BaseWLEngine::processNextWLReadEvent(){ - auto queue = updateQueue; - PacketPtr pkt = queue.front(); - /// conver to ReadReq - Addr req_addr = (pkt->getAddr() / 64) * 64; - int req_offset = (pkt->getAddr()) % 64; - RequestPtr request = - std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; - if (sendMemReq(memPkt)){ - queue.pop(); + if (memPortBlocked()) { + sendMemReq(memPkt) + updateQueue.pop(); } - if(!queue.empty() && !nextWLReadEvent.scheduled()){ + if (!queue.empty() && !nextWLReadEvent.scheduled()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -96,24 +72,15 @@ void BaseWLEngine::processNextWLReadEvent(){ bool BaseWLEngine::handleMemResp(PacketPtr pkt) { - auto queue = responseQueue; - if (queue.blocked()){ - queue.sendPktRetry = true; - return false; - } else{ - queue.push(pkt); - } - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; + responseQueue.push(pkt); + 
if(!nextWLReduceEvent.scheduled()){ + schedule(nextWLReduceEvent, nextCycle()); + } return true; } void BaseWLEngine::processNextWLReduceEvent(){ - auto queue = responseQueue; - auto updateQ = updateQueue; PacketPtr update = updateQ.front(); uint8_t* value = update->getPtr(); PacketPtr pkt = queue.front(); diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 4cb492914c..063e9909be 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -32,57 +32,26 @@ #include #include +#include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/clocked_object.hh" #include "sim/port.hh" #include "sim/system.hh" namespace gem5 { -class BaseWLEngine : public ClockedObject +class BaseWLEngine : public BaseEngine { private: - //FIXME: Change this - struct WLQueue{ - std::queue wlQueue; - uint32_t queueSize; - bool sendPktRetry; - - void resize(uint32_t size){ - queueSize = size; - } - - bool blocked(){ - return (wlQueue.size() == queueSize); - } - bool empty(){ - return wlQueue.empty(); - } - void push(PacketPtr pkt){ - wlQueue.push(pkt); - } - void pop(){ - wlQueue.pop(); - } - PacketPtr front(){ - return wlQueue.front(); - } - - WLQueue(uint32_t qSize): - queueSize(qSize), - sendPktRetry(false){} - }; - - RequestorID requestorId; - WLQueue updateQueue; - WLQueue responseQueue; - - std::unordered_map requestOffset; + std::queue updateQueue; + std::queue responseQueue; + + std::unordered_map requestOffsetMap; + std::unordered_map requestValueMap; //Events EventFunctionWrapper nextWLReadEvent; @@ -100,7 +69,7 @@ class BaseWLEngine : public ClockedObject Write edgelist loc in buffer */ protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; public: @@ -112,11 +81,8 @@ class 
BaseWLEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool handleWLUpdate(PacketPtr pkt); - bool handleMemResp(PacketPtr resp); + }; } From a8a3d0dc91778cbb21553938f7b3840e2d2af979 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 10:16:01 -0800 Subject: [PATCH 051/287] Cont. fix pointer issue. --- src/accl/graph/base/BasePushEngine.py | 6 +-- src/accl/graph/base/base_apply_engine.hh | 1 - src/accl/graph/base/base_push_engine.cc | 19 ------- src/accl/graph/base/base_push_engine.hh | 19 ++----- src/accl/graph/base/base_wl_engine.cc | 64 +++++++++--------------- 5 files changed, 31 insertions(+), 78 deletions(-) diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/BasePushEngine.py index c52a65abf9..2163864be3 100644 --- a/src/accl/graph/base/BasePushEngine.py +++ b/src/accl/graph/base/BasePushEngine.py @@ -27,12 +27,10 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseEngine import BaseEngine -class BasePushEngine(ClockedObject): +class BasePushEngine(BaseEngine): abstract = True type = 'BasePushEngine' cxx_header = "accl/graph/base/base_push_engine.hh" cxx_class = 'gem5::BasePushEngine' - - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index dc7188ab56..2cb9d8b918 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -48,7 +48,6 @@ class BaseApplyEngine : public BaseEngine std::queue applyReadQueue; std::queue applyWriteQueue; - int queueSize; std::unordered_map requestOffset; diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index d93cbdf8da..f2384c434b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ 
b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,6 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : ClockedObject(params), - requestorId(-1), // vertexQueueSize(params.vertex_queue_size), // vertexQueueLen(0), // updateQueue(params.update_queue_size), @@ -46,24 +45,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : { } -Port & -BasePushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -RequestorID -BasePushEngine::getRequestorId() -{ - return requestorId; -} - -void -BasePushEngine::setRequestorId(RequestorID requestorId) -{ - this->requestorId = requestorId; -} - bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 446f6a1186..f568b6ecc3 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -31,16 +31,16 @@ #include +#include "accl/graph/base/base_engine.hh" #include "mem/port.hh" #include "mem/request.hh" #include "mem/packet.hh" #include "params/BasePushEngine.hh" -#include "sim/clocked_object.hh" namespace gem5 { -class BasePushEngine : public ClockedObject +class BasePushEngine : public BaseEngine { private: @@ -53,9 +53,6 @@ class BasePushEngine : public ClockedObject prop(prop), degree(degree), edgeIndex(edge_index) {} }; - - RequestorID requestorId; - std::queue notifQueue; // int vertexQueueSize; // int vertexQueueLen; @@ -64,8 +61,6 @@ class BasePushEngine : public ClockedObject std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue memReqQueue; // Infinite queueing? 
- std::queue updateQueue; // int updateQueueSize; // int updateQueueLen; @@ -80,8 +75,8 @@ class BasePushEngine : public ClockedObject void processNextSendEvent(); protected: - virtual bool sendMemReq(PacketPtr pkt) = 0; virtual bool sendPushUpdate(PacketPtr pkt) = 0; + virtual bool handleMemResp(PacketPtr pkt); public: @@ -89,14 +84,8 @@ class BasePushEngine : public ClockedObject BasePushEngine(const BasePushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID getRequestorId(); - void setRequestorId(RequestorID requestorId); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - bool handleMemResp(PacketPtr pkt); + }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index aab39fb7a3..d5b18bafa0 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,13 +52,15 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); + uint32_t data = *(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[request] = req_offset; + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; if (memPortBlocked()) { sendMemReq(memPkt) @@ -80,51 +82,35 @@ BaseWLEngine::handleMemResp(PacketPtr pkt) } void -BaseWLEngine::processNextWLReduceEvent(){ - PacketPtr update = updateQ.front(); - uint8_t* value = update->getPtr(); - PacketPtr pkt = queue.front(); - uint8_t* data = pkt->getPtr(); - RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; +BaseWLEngine::processNextWLReduceEvent() +{ + PacketPtr resp = responseQueue.front(); + uint8_t* respData = resp->getPtr(); + Addr request_offset = requestOffsetMap[resp->req]; + uint32_t value = requestValueMap[resp->req]; 
WorkListItem wl = memoryToWorkList(data + request_offset); - uint32_t temp_prop = wl.temp_prop; - if (temp_prop != *value){ + + if (value < wl.temp_prop){ //update prop with temp_prop - if(*value < temp_prop){ - temp_prop = *value; - } - // if (!memPort.blocked() && !applyPort.blocked()){ - wl.temp_prop = temp_prop; - uint8_t* wlItem = workListToMemory(wl); - memcpy(data + request_offset, wlItem, sizeof(WorkListItem)); + wl.temp_prop = value; + + uint8_t* wlData = workListToMemory(wl); + memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - if (sendMemReq(writePkt) && - sendWLNotif(writePkt->getAddr())) { - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - // respPort.trySendRetry(); - updateQ.sendPktRetry = false; + getWritePacket(pkt->getAddr(), 64, respData, requestorId); + + if (!memPortBlocked()) { + if (sendWLNotif(pkt->getAddr() + request_offset)) { + sendMemReq(writePkt); + responseQueue.pop(); + // TODO: Erase map entries, delete wlData; } } } - else{ - queue.pop(); - if (!queue.blocked() && queue.sendPktRetry){ - queue.sendPktRetry = false; - } - updateQ.pop(); - if (!updateQ.blocked() & updateQ.sendPktRetry){ - updateQ.sendPktRetry = false; - } - + else { + responseQueue.pop(); } - if (!queue.empty() && !nextWLReduceEvent.scheduled()){ + if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ schedule(nextWLReduceEvent, nextCycle()); } } From 5a595540a569128ec01d730c25f4091a0a7c3a6f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:07:21 -0800 Subject: [PATCH 052/287] Cont. fix pointer issue. MemQ to BaseEngine. 
--- src/accl/graph/base/base_apply_engine.cc | 22 ++----- src/accl/graph/base/base_apply_engine.hh | 11 +--- src/accl/graph/base/base_engine.cc | 13 +++- src/accl/graph/base/base_engine.hh | 17 +++++- src/accl/graph/base/base_push_engine.cc | 77 ++++++------------------ src/accl/graph/base/base_push_engine.hh | 16 +---- src/accl/graph/base/base_wl_engine.cc | 22 ++----- src/accl/graph/base/base_wl_engine.hh | 3 +- src/accl/graph/sega/mpu.hh | 2 - 9 files changed, 65 insertions(+), 118 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7f6c32cf39..842481c2d1 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -73,20 +73,10 @@ BaseApplyEngine::processNextApplyCheckEvent() } } -bool -BaseApplyEngine::handleMemResp(PacketPtr pkt) -{ - applyWriteQueue.push(pkt); - if(!nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); - } - return true; -} - void -BaseApplyEngine::processNextApplyEvent() +BaseApplyEngine::processNextMemRespEvent() { - PacketPtr pkt = applyWriteQueue.front(); + PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; @@ -110,14 +100,14 @@ BaseApplyEngine::processNextApplyEvent() if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); - applyWriteQueue.pop(); + memRespQueue.pop(); } } } else { - applyWriteQueue.pop(); + memRespQueue.pop(); } - if (!applyWriteQueue.empty() && !nextApplyEvent.scheduled()){ - schedule(nextApplyEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 2cb9d8b918..02646a74ff 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -45,21 +45,17 @@ namespace gem5 class BaseApplyEngine : 
public BaseEngine { private: - std::queue applyReadQueue; - std::queue applyWriteQueue; std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - protected: - virtual bool handleMemResp(PacketPtr pkt); - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) = 0; + virtual void processNextMemRespEvent(); public: PARAMS(BaseApplyEngine); @@ -70,7 +66,6 @@ class BaseApplyEngine : public BaseEngine PortID idx=InvalidPortID) override; bool recvWLNotif(Addr addr); - }; } diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index d53e2e683a..6a50e1630e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,7 +35,8 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), requestorId(system->getRequestorId()), - memPort(name() + ".memPort", this) + memPort(name() + ".memPort", this), + nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -72,4 +73,14 @@ BaseEngine::MemPort::recvReqRetry() } } +bool +BaseEngine::handleMemResp(PacketPtr pkt) +{ + memRespQueue.push(pkt); + if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemResponseEvent, nextCycle()); + } + return true; +} + } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index f9f500e118..4f5a29676d 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -66,14 +66,28 @@ class BaseEngine : public ClockedObject virtual bool recvTimingResp(PacketPtr pkt); virtual void recvReqRetry(); }; + System* system; const RequestorID requestorId; MemPort memPort; + bool handleMemResp(PacketPtr resp); + EventFunctionWrapper nextMemRespEvent; + protected: bool 
memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - virtual bool handleMemResp(PacketPtr resp) = 0; + + // TODO: Add this later, maybe? + // int memRespQueueSize; + std::queue memRespQueue; + /* Respective function for nextMemRespEvent. + All the classes inheriting from this class will + do their main processing in this function. For + example, BaseWLEngine reduces the temp_pro with + the value of update in this function. + */ + virtual void processNextMemRespEvent() = 0; public: PARAMS(BaseEngine); @@ -82,7 +96,6 @@ class BaseEngine : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f2384c434b..4c43f95939 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -40,7 +40,6 @@ BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : // updateQueue(params.update_queue_size), // updateQueueLen(0), nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextSendEvent([this] { processNextSendEvent(); }, name()) { } @@ -49,16 +48,6 @@ bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { - //FIXME: There should be a check if the queues are full. 
- // if (vertexQueueLen < vertexQueueSize) { - // vertexQueue.push(pkt) - // vertexQueueLen++; - // if (!nextReceiveEvent.scheduled()) { - // schedule(nextReceiveEvent, nextCycle()); - // } - // return true; - // } - // return false; notifQueue.emplace(prop, degree, edge_index); if (!nextReceiveEvent.scheduled()) { schedule(nextReceiveEvent, nextCycle()); @@ -67,7 +56,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, } void -BasePushEngine::processNextReceiveEvent() +BasePushEngine::processNextReadEvent() { ApplyNotif notif = notifQueue.front(); @@ -95,39 +84,28 @@ BasePushEngine::processNextReceiveEvent() offset_queue.push_back(req_offset); num_edge_queue.push_back(1); } - } + }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - memReqQueue.push(pkt); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; + if (!memPortBlocked()) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = notif.prop; + sendMemReq(pkt); + notifQueue.pop(); + } } - notifQueue.pop(); - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { + if (!nextReadEvent.scheduled() && !notifQueue.empty()) { schedule(nextReadEvent, nextCycle()); } } void -BasePushEngine::processNextReadEvent() -{ - PacketPtr pkt = memReqQueue.front(); - if (!sendMemReq(pkt)) { - memReqQueue.pop(); - } - - if (!nextReadEvent.scheduled() && !memReqQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -bool -BasePushEngine::handleMemResp(PacketPtr pkt) +BasePushEngine::processNextMemRespEvent() { + PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); @@ -137,7 +115,7 @@ BasePushEngine::handleMemResp(PacketPtr pkt) int edge_in_bytes = sizeof(Edge) / 
sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + i * edge_in_bytes; + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); uint32_t *update_data = new uint32_t; @@ -146,29 +124,14 @@ BasePushEngine::handleMemResp(PacketPtr pkt) PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - updateQueue.push(update); - } - - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); - } - - //TODO: Should we always return true? It's the response from the memory - // so maybe yes. We assume the receiving bandwidth of the PushEngine is - // higher than its demand bandwidth - return true; -} - -void -BasePushEngine::processNextSendEvent() -{ - PacketPtr pkt = updateQueue.front(); - if (!sendPushUpdate(pkt)) { - updateQueue.pop(); + if (sendPushUpdate(update)) { + memRespQueue.pop(); + // TODO: Erase map entries here. 
+ } } - if (!nextSendEvent.scheduled() && !updateQueue.empty()) { - schedule(nextSendEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index f568b6ecc3..5a6ef85b0f 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -43,7 +43,6 @@ namespace gem5 class BasePushEngine : public BaseEngine { private: - struct ApplyNotif { uint32_t prop; uint32_t degree; @@ -53,30 +52,20 @@ class BasePushEngine : public BaseEngine prop(prop), degree(degree), edgeIndex(edge_index) {} }; + std::queue notifQueue; // int vertexQueueSize; - // int vertexQueueLen; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - std::queue updateQueue; - // int updateQueueSize; - // int updateQueueLen; - - EventFunctionWrapper nextReceiveEvent; - void processNextReceiveEvent(); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); - EventFunctionWrapper nextSendEvent; - void processNextSendEvent(); - protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual bool handleMemResp(PacketPtr pkt); + virtual void processNextMemRespEvent(); public: @@ -85,7 +74,6 @@ class BasePushEngine : public BaseEngine BasePushEngine(const BasePushEngineParams ¶ms); bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); - }; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index d5b18bafa0..5d84e34ccd 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -63,7 +63,7 @@ void BaseWLEngine::processNextWLReadEvent() requestValueMap[memPkt->req] = value; if (memPortBlocked()) { - sendMemReq(memPkt) + sendMemReq(memPkt); updateQueue.pop(); } if (!queue.empty() && !nextWLReadEvent.scheduled()) { @@ -71,20 +71,10 @@ void 
BaseWLEngine::processNextWLReadEvent() } } -bool -BaseWLEngine::handleMemResp(PacketPtr pkt) -{ - responseQueue.push(pkt); - if(!nextWLReduceEvent.scheduled()){ - schedule(nextWLReduceEvent, nextCycle()); - } - return true; -} - void -BaseWLEngine::processNextWLReduceEvent() +BaseWLEngine::processNextMemRespEvent() { - PacketPtr resp = responseQueue.front(); + PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; @@ -102,15 +92,15 @@ BaseWLEngine::processNextWLReduceEvent() if (!memPortBlocked()) { if (sendWLNotif(pkt->getAddr() + request_offset)) { sendMemReq(writePkt); - responseQueue.pop(); + memRespQueue.pop(); // TODO: Erase map entries, delete wlData; } } } else { - responseQueue.pop(); + memRespQueue.pop(); } - if (!responseQueue.empty() && !nextWLReduceEvent.scheduled()){ + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 063e9909be..ab8952de41 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -69,8 +69,8 @@ class BaseWLEngine : public BaseEngine Write edgelist loc in buffer */ protected: - virtual bool handleMemResp(PacketPtr resp); virtual bool sendWLNotif(Addr addr) = 0; + virtual void processNextMemRespEvent(); public: @@ -82,7 +82,6 @@ class BaseWLEngine : public BaseEngine PortID idx=InvalidPortID) override; bool handleWLUpdate(PacketPtr pkt); - }; } diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index cf241c9063..8b5ba20b1c 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -129,8 +129,6 @@ class MPU : public ClockedObject void handleMemResp(PacketPtr pkt); bool handleWLUpdate(PacketPtr pkt); - bool recvWLNotif(Addr addr); - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t 
edge_index); bool recvPushUpdate(PacketPtr pkt); }; From 4d2ad56c51ecfd4070a0800d9ec51cf5fc5aa225 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:21:51 -0800 Subject: [PATCH 053/287] Pointer issue fixed. --- src/accl/graph/sega/MPU.py | 4 --- src/accl/graph/sega/WLEngine.py | 3 +- src/accl/graph/sega/apply_engine.cc | 14 +++----- src/accl/graph/sega/apply_engine.hh | 7 ++-- src/accl/graph/sega/mpu.cc | 55 ++++------------------------- src/accl/graph/sega/mpu.hh | 10 +----- src/accl/graph/sega/push_engine.cc | 15 +------- src/accl/graph/sega/push_engine.hh | 5 --- src/accl/graph/sega/wl_engine.cc | 14 +++----- src/accl/graph/sega/wl_engine.hh | 7 ++-- 10 files changed, 23 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index efd8dbc11f..71b8841b10 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -38,12 +38,8 @@ class MPU(ClockedObject): cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - apply_engine = Param.ApplyEngine(NULL, "ApplyEngine object to connect to " - "This MPU") push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " "This MPU") - work_list_engine = Param.WLEngine(NULL, "WLEngine object to connect to " - "This MPU") respPort = ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 12fbcf9b4f..3bfe9fa16f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,4 +34,5 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this WLEngine") \ No newline at end of file + apply_engine = Param.ApplyEngine(Parent.any, + "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 
bc45850041..0f686e7f8c 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,24 +27,20 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/push_engine.hh" namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - mpu(params.mpu) + pushEngine(params.push_engine) {} bool -ApplyEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} +ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) +{ + return push_engine->recvApplyNotif(prop, degree, edgeIndex); -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex){ - mpu->recvApplyNotif(prop, degree, edgeIndex); - return true; } } \ No newline at end of file diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index c7d3073e36..4d828c6aa1 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -42,17 +42,14 @@ namespace gem5 { -class MPU; +class PushEngine; class ApplyEngine : public BaseApplyEngine { private: - - MPU* mpu; + PushEngine* pushEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); public: diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4824bcd699..23a777d1c6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -33,12 +33,9 @@ namespace gem5 MPU::MPU(const MPUParams ¶ms): ClockedObject(params), - nextRequestorId(0), respPort(name() + ".respPort", this), reqPort(name() + ".reqPort", this), memPort(name() + ".memPort", this), - applyEngine(params.apply_engine), - pushEngine(params.push_engine), wlEngine(params.work_list_engine) {} @@ -59,16 +56,6 @@ MPU::getPort(const std::string &if_name, PortID idx) void MPU::startup() { - if (((int16_t) applyEngine->getRequestorId()) == -1) { - 
applyEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) pushEngine->getRequestorId()) == -1) { - pushEngine->setRequestorId(nextRequestorId++); - } - if (((int16_t) wlEngine->getRequestorId()) == -1) { - wlEngine->setRequestorId(nextRequestorId++); - } - //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { @@ -177,9 +164,7 @@ MPU::MPUMemPort::sendPacket(PacketPtr pkt) bool MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) { - //TODO: Investigate sending true all the time - owner->handleMemResp(pkt); - return true; + panic("recvTimingResp called on MPU::MPUMemPort memPort."); } void @@ -224,16 +209,7 @@ MPU::handleMemReq(PacketPtr pkt) void MPU::handleMemResp(PacketPtr pkt) { - RequestorID requestorId = pkt->requestorId(); - if (applyEngine->getRequestorId() == requestorId) { - applyEngine->handleMemResp(pkt); - } else if (pushEngine->getRequestorId() == requestorId) { - pushEngine->handleMemResp(pkt); - } else if (wlEngine->getRequestorId() == requestorId) { - wlEngine->handleMemResp(pkt); - } else { - panic("Received a response with an unknown requestorId."); - } + panic("MPU::handleMemResp called!"); } bool @@ -242,39 +218,20 @@ MPU::handleWLUpdate(PacketPtr pkt) return wlEngine->handleWLUpdate(pkt); } -bool -MPU::recvWLNotif(Addr addr) -{ - return applyEngine->recvWLNotif(addr); -} - -bool -MPU::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) -{ - return pushEngine->recvApplyNotif(prop, degree, edge_index); -} - bool MPU::recvPushUpdate(PacketPtr pkt) { Addr addr = pkt->getAddr(); for (auto addr_range: memPort.getAddrRanges()) { if (addr_range.contains(addr)) { - if (memPort.blocked()) { - return false; - } else { - memPort.sendPacket(pkt); - return true; - } + return handleWLUpdate(pkt); } } - - if (reqPort.blocked()) { - return false; + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; } - reqPort.sendPacket(pkt); return true; - } } diff 
--git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8b5ba20b1c..2df8993749 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -103,18 +103,13 @@ class MPU : public ClockedObject virtual void recvReqRetry(); }; - virtual void startup(); - - RequestorID nextRequestorId; - MPURespPort respPort; MPUReqPort reqPort; MPUMemPort memPort; - ApplyEngine* applyEngine; - PushEngine* pushEngine; WLEngine* wlEngine; + virtual void startup(); AddrRangeList getAddrRanges(); void recvFunctional(PacketPtr pkt); @@ -125,9 +120,6 @@ class MPU : public ClockedObject Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool handleMemReq(PacketPtr pkt); - void handleMemResp(PacketPtr pkt); - bool handleWLUpdate(PacketPtr pkt); bool recvPushUpdate(PacketPtr pkt); }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 922ae32ed2..71cb2955fd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,20 +35,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), mpu(params.mpu) -{ -} - -Port & -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - return SimObject::getPort(if_name, idx); -} - -bool -PushEngine::sendMemReq(PacketPtr pkt) -{ - return mpu->handleMemReq(pkt); -} +{} bool PushEngine::sendPushUpdate(PacketPtr pkt) diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1a800e58f3..7b3474d2ec 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -43,16 +43,11 @@ class PushEngine : public BasePushEngine MPU* mpu; protected: - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendPushUpdate(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - }; } diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 40ec755969..3d9d7af0c6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,25 +27,19 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/apply_engine.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), - mpu(params.mpu) + applyEngine(params.apply_engine) {} -bool -WLEngine::sendMemReq(PacketPtr pkt){ - return mpu->handleMemReq(pkt); -} - -// FIXME: handle the case where Apply queue is full bool WLEngine::sendWLNotif(Addr addr){ - mpu->recvWLNotif(addr); - return true; + apply_engine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 238ffbe724..c154867b0d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,17 +45,14 @@ namespace gem5 { -// class MPU; +class ApplyEngine; class WLEngine : public BaseWLEngine { private: - - MPU* mpu; + ApplyEngine* applyEngine; protected: - - virtual bool sendMemReq(PacketPtr pkt); virtual bool sendWLNotif(Addr addr); public: From 39883a68c9f8c2895ce9c0a5315dd9cf4eec7a9c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:24:07 -0800 Subject: [PATCH 054/287] Adding BaseEngine to SConscript. 
--- src/accl/graph/base/SConscript | 2 ++ 1 file changed, 2 insertions(+) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index cc55100064..41c48fc419 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -28,10 +28,12 @@ Import('*') SimObject('BaseApplyEngine.py') +SimObject('BaseEngine.py') SimObject('BasePushEngine.py') SimObject('BaseWLEngine.py') Source('base_apply_engine.cc') +Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') From adfa21a1a8b9ee69b7e75dab14e8db2f1be7e2ca Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 12:43:55 -0800 Subject: [PATCH 055/287] Compilation issues fixed. Still linking issues. --- src/accl/graph/base/BaseEngine.py | 1 + src/accl/graph/base/base_apply_engine.cc | 3 +-- src/accl/graph/base/base_engine.cc | 6 +++--- src/accl/graph/base/base_engine.hh | 14 +++++++------- src/accl/graph/base/base_push_engine.cc | 16 +++++----------- src/accl/graph/base/base_wl_engine.cc | 10 +++++----- src/accl/graph/sega/MPU.py | 8 ++------ src/accl/graph/sega/apply_engine.cc | 3 +-- src/accl/graph/sega/apply_engine.hh | 2 +- src/accl/graph/sega/mpu.cc | 14 -------------- src/accl/graph/sega/mpu.hh | 2 -- src/accl/graph/sega/push_engine.cc | 1 - src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 3 +-- src/accl/graph/sega/wl_engine.hh | 1 + 15 files changed, 29 insertions(+), 56 deletions(-) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 3eb5f0cbbc..367df8dbc1 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -35,4 +35,5 @@ class BaseEngine(ClockedObject): cxx_header = "accl/graph/base/base_engine.hh" cxx_class = 'gem5::BaseEngine' + system = Param.System(Parent.any, 'System this Engine is a part of') memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.cc 
b/src/accl/graph/base/base_apply_engine.cc index 842481c2d1..b7f3030e00 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,8 +37,7 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) {} bool diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6a50e1630e..06827c1d4e 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -34,8 +34,8 @@ namespace gem5 BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), - requestorId(system->getRequestorId()), memPort(name() + ".memPort", this), + requestorId(system->getRequestorId(this)), nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) {} @@ -77,8 +77,8 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemResponseEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemResponseEvent, nextCycle()); + if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextMemRespEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 4f5a29676d..057a4c6d91 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -68,25 +68,25 @@ class BaseEngine : public ClockedObject }; System* system; - const RequestorID requestorId; MemPort memPort; bool handleMemResp(PacketPtr resp); - EventFunctionWrapper nextMemRespEvent; protected: - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } - + const RequestorID requestorId; // TODO: Add this later, maybe? 
// int memRespQueueSize; std::queue memRespQueue; - /* Respective function for nextMemRespEvent. - All the classes inheriting from this class will + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + + /* All the classes inheriting from this class will do their main processing in this function. For example, BaseWLEngine reduces the temp_pro with the value of update in this function. */ + EventFunctionWrapper nextMemRespEvent; virtual void processNextMemRespEvent() = 0; public: diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 4c43f95939..187eefe01b 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -34,23 +34,17 @@ namespace gem5 { BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - ClockedObject(params), - // vertexQueueSize(params.vertex_queue_size), - // vertexQueueLen(0), - // updateQueue(params.update_queue_size), - // updateQueueLen(0), - nextReceiveEvent([this] { processNextReceiveEvent(); }, name()), - nextSendEvent([this] { processNextSendEvent(); }, name()) -{ -} + BaseEngine(params), + nextReadEvent([this] { processNextReadEvent(); }, name()) +{} bool BasePushEngine::recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index) { notifQueue.emplace(prop, degree, edge_index); - if (!nextReceiveEvent.scheduled()) { - schedule(nextReceiveEvent, nextCycle()); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); } return true; } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 5d84e34ccd..20abaa7b20 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -52,7 +52,7 @@ BaseWLEngine::handleWLUpdate(PacketPtr pkt) void BaseWLEngine::processNextWLReadEvent() { PacketPtr pkt = updateQueue.front(); - uint32_t data = *(pkt->getPtr()); + uint32_t value = 
*(pkt->getPtr()); Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; @@ -66,7 +66,7 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } - if (!queue.empty() && !nextWLReadEvent.scheduled()) { + if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); } } @@ -78,7 +78,7 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* respData = resp->getPtr(); Addr request_offset = requestOffsetMap[resp->req]; uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(data + request_offset); + WorkListItem wl = memoryToWorkList(respData + request_offset); if (value < wl.temp_prop){ //update prop with temp_prop @@ -87,10 +87,10 @@ BaseWLEngine::processNextMemRespEvent() uint8_t* wlData = workListToMemory(wl); memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, respData, requestorId); + getWritePacket(resp->getAddr(), 64, respData, requestorId); if (!memPortBlocked()) { - if (sendWLNotif(pkt->getAddr() + request_offset)) { + if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); // TODO: Erase map entries, delete wlData; diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 71b8841b10..87de0fb7d6 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -28,18 +28,14 @@ from m5.params import * from m5.proxy import * from m5.objects.ClockedObject import ClockedObject - -# from m5.objects.WLEngine import WLEngine -# from m5.objects.PushEngine import PushEngine -# from m5.objects.ApplyEngine import ApplyEngine +from m5.objects.WLEngine import WLEngine class MPU(ClockedObject): type = 'MPU' cxx_header = "accl/graph/sega/mpu.hh" cxx_class = 'gem5::MPU' - push_engine = Param.PushEngine(NULL, "PushEngine object to connect to " - "This MPU") + work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") respPort = 
ResponsePort("Port to Receive updates from outside") reqPort = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index 0f686e7f8c..bc3d703cf6 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" namespace gem5{ @@ -39,7 +38,7 @@ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : bool ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) { - return push_engine->recvApplyNotif(prop, degree, edgeIndex); + return pushEngine->recvApplyNotif(prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 4d828c6aa1..aff2c5417b 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/ApplyEngine.hh" @@ -42,7 +43,6 @@ namespace gem5 { -class PushEngine; class ApplyEngine : public BaseApplyEngine { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 23a777d1c6..9bda696cb5 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -198,20 +198,6 @@ MPU::recvFunctional(PacketPtr pkt) } } -bool -MPU::handleMemReq(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - memPort.sendPacket(pkt); - return true; -} - -void -MPU::handleMemResp(PacketPtr pkt) -{ - panic("MPU::handleMemResp called!"); -} - bool MPU::handleWLUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 2df8993749..a0472eead5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,8 +29,6 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ 
-#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 71cb2955fd..a1fa86da2b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/mpu.hh" namespace gem5 { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7b3474d2ec..edf698011d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3d9d7af0c6..823aa49bb9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,6 @@ */ #include "accl/graph/sega/wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" namespace gem5 { @@ -39,7 +38,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): bool WLEngine::sendWLNotif(Addr addr){ - apply_engine->recvWLNotif(addr); + return applyEngine->recvWLNotif(addr); } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c154867b0d..6946713aaa 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" #include "base/addr_range.hh" #include "mem/port.hh" #include "mem/packet.hh" From 05771a071f7016fe66fc0da8e551ef793ac0c059 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:19:29 -0800 Subject: [PATCH 056/287] Removing unnecessary includes. 
--- src/accl/graph/base/base_apply_engine.cc | 4 ++-- src/accl/graph/base/base_apply_engine.hh | 5 +---- src/accl/graph/base/base_engine.hh | 4 +--- src/accl/graph/base/base_push_engine.hh | 4 +--- src/accl/graph/base/base_wl_engine.hh | 8 -------- 5 files changed, 5 insertions(+), 20 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index b7f3030e00..009c01ccb7 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -59,7 +59,7 @@ BaseApplyEngine::processNextApplyCheckEvent() // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; - int req_offset = (addr % 64); + Addr req_offset = (addr % 64); RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); requestOffset[request] = req_offset; @@ -79,7 +79,7 @@ BaseApplyEngine::processNextMemRespEvent() uint8_t* data = pkt->getPtr(); RequestPtr request = pkt->req; - int request_offset = requestOffset[request]; + Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); // FIXME: Not so much of a fixme. 
However, why do we fwd a worklistitem diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 02646a74ff..e3fe47d923 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -33,11 +33,8 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" #include "mem/request.hh" #include "params/BaseApplyEngine.hh" -#include "sim/port.hh" namespace gem5 { @@ -47,7 +44,7 @@ class BaseApplyEngine : public BaseEngine private: std::queue applyReadQueue; - std::unordered_map requestOffset; + std::unordered_map requestOffset; EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 057a4c6d91..b0b05d9477 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -34,10 +34,8 @@ #include "mem/packet.hh" #include "mem/port.hh" -#include "mem/request.hh" #include "params/BaseEngine.hh" #include "sim/clocked_object.hh" -#include "sim/port.hh" #include "sim/system.hh" namespace gem5 @@ -79,7 +77,7 @@ class BaseEngine : public ClockedObject std::queue memRespQueue; bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) {memPort.sendPacket(pkt); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } /* All the classes inheriting from this class will do their main processing in this function. 
For diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 5a6ef85b0f..0da4241dfd 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -32,9 +32,7 @@ #include #include "accl/graph/base/base_engine.hh" -#include "mem/port.hh" #include "mem/request.hh" -#include "mem/packet.hh" #include "params/BasePushEngine.hh" namespace gem5 @@ -54,7 +52,7 @@ class BasePushEngine : public BaseEngine }; std::queue notifQueue; - // int vertexQueueSize; + // int notifQueueSize; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index ab8952de41..3ca9a146a1 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -34,12 +34,7 @@ #include "accl/graph/base/base_engine.hh" #include "accl/graph/base/util.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/BaseWLEngine.hh" -#include "sim/port.hh" -#include "sim/system.hh" namespace gem5 { @@ -78,9 +73,6 @@ class BaseWLEngine : public BaseEngine BaseWLEngine(const BaseWLEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool handleWLUpdate(PacketPtr pkt); }; From 01b4b2a5a80247c969243bbb52bbbe9bd4ef41f8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 13:51:30 -0800 Subject: [PATCH 057/287] Fixing the issue of calling pure virtual function. 
--- src/accl/graph/base/base_apply_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_apply_engine.hh | 6 +++++- src/accl/graph/base/base_engine.cc | 7 ++----- src/accl/graph/base/base_engine.hh | 8 +------- src/accl/graph/base/base_push_engine.cc | 17 +++++++++++++---- src/accl/graph/base/base_push_engine.hh | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 13 +++++++++++-- src/accl/graph/base/base_wl_engine.hh | 2 +- 8 files changed, 50 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 009c01ccb7..e7b7dd6a22 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -37,7 +37,8 @@ namespace gem5 BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()) + nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), + nextApplyEvent([this]{ processNextApplyEvent(); }, name()) {} bool @@ -73,7 +74,7 @@ BaseApplyEngine::processNextApplyCheckEvent() } void -BaseApplyEngine::processNextMemRespEvent() +BaseApplyEngine::processNextApplyEvent() { PacketPtr pkt = memRespQueue.front(); uint8_t* data = pkt->getPtr(); @@ -105,8 +106,16 @@ BaseApplyEngine::processNextMemRespEvent() } else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextMemRespEvent, nextCycle()); + if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ + schedule(nextApplyEvent, nextCycle()); + } +} + +void +BaseApplyEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index e3fe47d923..486fb687fe 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -49,10 +49,14 @@ class 
BaseApplyEngine : public BaseEngine EventFunctionWrapper nextApplyCheckEvent; void processNextApplyCheckEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void processNextMemRespEvent(); + + virtual void scheduleMainEvent(); public: PARAMS(BaseApplyEngine); diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 06827c1d4e..245192643c 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -35,8 +35,7 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : ClockedObject(params), system(params.system), memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)), - nextMemRespEvent([this] { processNextMemRespEvent(); }, name()) + requestorId(system->getRequestorId(this)) {} @@ -77,9 +76,7 @@ bool BaseEngine::handleMemResp(PacketPtr pkt) { memRespQueue.push(pkt); - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); - } + scheduleMainEvent(); return true; } diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index b0b05d9477..3436229aa1 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -79,13 +79,7 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - /* All the classes inheriting from this class will - do their main processing in this function. For - example, BaseWLEngine reduces the temp_pro with - the value of update in this function. 
- */ - EventFunctionWrapper nextMemRespEvent; - virtual void processNextMemRespEvent() = 0; + virtual void scheduleMainEvent() = 0; public: PARAMS(BaseEngine); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 187eefe01b..a963cc9709 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -35,7 +35,8 @@ namespace gem5 BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()) + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} bool @@ -97,7 +98,7 @@ BasePushEngine::processNextReadEvent() } void -BasePushEngine::processNextMemRespEvent() +BasePushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); RequestPtr req = pkt->req; @@ -124,8 +125,16 @@ BasePushEngine::processNextMemRespEvent() } } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextMemRespEvent, nextCycle()); + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +BasePushEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { + schedule(nextPushEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 0da4241dfd..8bb7d6663a 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -61,9 +61,12 @@ class BasePushEngine : public BaseEngine EventFunctionWrapper nextReadEvent; void processNextReadEvent(); + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 
20abaa7b20..ef66603de7 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -72,7 +72,7 @@ void BaseWLEngine::processNextWLReadEvent() } void -BaseWLEngine::processNextMemRespEvent() +BaseWLEngine::processNextWLReduceEvent() { PacketPtr resp = memRespQueue.front(); uint8_t* respData = resp->getPtr(); @@ -100,9 +100,18 @@ BaseWLEngine::processNextMemRespEvent() else { memRespQueue.pop(); } - if (!nextMemRespEvent.scheduled() && !memRespQueue.empty()){ + if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } } +void +BaseWLEngine::scheduleMainEvent() +{ + if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { + schedule(nextWLReduceEvent, nextCycle()); + } +} + + } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 3ca9a146a1..a5070f0b26 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void processNextMemRespEvent(); + virtual void scheduleMainEvent(); public: From 235746cdf270f617df2c556e3a676d7f4d02b355 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 17:33:02 -0800 Subject: [PATCH 058/287] Fixed cycle in hierarchy and config. Sim starts. 
--- configs/accl/sega.py | 40 ++-- src/accl/graph/base/BaseEngine.py | 2 +- src/accl/graph/base/base_apply_engine.hh | 5 +- src/accl/graph/base/base_engine.cc | 12 ++ src/accl/graph/base/base_engine.hh | 5 +- src/accl/graph/base/base_push_engine.hh | 2 +- src/accl/graph/base/base_wl_engine.hh | 2 +- src/accl/graph/sega/MPU.py | 42 ----- src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 223 ----------------------- src/accl/graph/sega/mpu.hh | 127 ------------- src/accl/graph/sega/push_engine.cc | 49 ++++- src/accl/graph/sega/push_engine.hh | 27 ++- src/accl/graph/sega/wl_engine.cc | 88 +++++++++ src/accl/graph/sega/wl_engine.hh | 34 +++- 18 files changed, 238 insertions(+), 428 deletions(-) delete mode 100644 src/accl/graph/sega/MPU.py delete mode 100644 src/accl/graph/sega/mpu.cc delete mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 288b1211e4..ea158ecdc9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,28 +1,46 @@ import m5 from m5.objects import * -class PyMPU(MPU): - def __init__(self, clk_domain): - super().__init__() - self.clk_domain = clk_domain - self.apply_engine = ApplyEngine() +class MPU(SubSystem): + def __init__(self): + super(MPU, self).__init__() self.push_engine = PushEngine() - self.wl_engine = WLEngine() + self.apply_engine = ApplyEngine(push_engine = self.push_engine) + self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.interconnect = SystemXBar() -class SEGA(System): + self.interconnect.cpu_side_ports = self.wl_engine.mem_port + self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.push_engine.mem_port + + def getRespPort(self): + return self.wl_engine.resp_port + def setRespPort(self, port): + self.wl_engine.resp_port = port + + def getReqPort(self): + return 
self.push_engine.req_port + def setReqPort(self, port): + self.push_engine.req_port = port + def getMemPort(self): + return self.interconnect.mem_side_ports + def setMemPort(self, port): + self.interconnect.mem_side_ports = port + +class SEGA(System): def __init__(self): super(SEGA, self).__init__() - # Set up the clock domain and the voltage domain + self.clk_domain = SrcClockDomain() self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = PyMPU(self.clk_domain) + self.mpu = MPU() self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - self.mpu.memPort = self.mem_ctrl.port - self.mpu.reqPort = self.mpu.respPort + self.mpu.setReqPort(self.mpu.getRespPort()) + self.mpu.setMemPort(self.mem_ctrl.port) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/BaseEngine.py index 367df8dbc1..16c2f402e5 100644 --- a/src/accl/graph/base/BaseEngine.py +++ b/src/accl/graph/base/BaseEngine.py @@ -36,4 +36,4 @@ class BaseEngine(ClockedObject): cxx_class = 'gem5::BaseEngine' system = Param.System(Parent.any, 'System this Engine is a part of') - memPort = RequestPort("Port to communicate with the memory") + mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 486fb687fe..9111bd074b 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -56,16 +56,13 @@ class BaseApplyEngine : public BaseEngine virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: PARAMS(BaseApplyEngine); BaseApplyEngine(const BaseApplyEngineParams &apply); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - bool recvWLNotif(Addr addr); }; diff --git 
a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 245192643c..6b40ba4137 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -38,6 +38,18 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : requestorId(system->getRequestorId(this)) {} +BaseEngine::~BaseEngine() +{} + +Port& +BaseEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} void BaseEngine::MemPort::sendPacket(PacketPtr pkt) diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/base_engine.hh index 3436229aa1..53415ddc7c 100644 --- a/src/accl/graph/base/base_engine.hh +++ b/src/accl/graph/base/base_engine.hh @@ -32,6 +32,7 @@ #include #include +#include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" #include "params/BaseEngine.hh" @@ -78,6 +79,8 @@ class BaseEngine : public ClockedObject bool memPortBlocked() { return memPort.blocked(); } void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } virtual void scheduleMainEvent() = 0; @@ -85,7 +88,7 @@ class BaseEngine : public ClockedObject PARAMS(BaseEngine); BaseEngine(const BaseEngineParams ¶ms); - + ~BaseEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; }; diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/base_push_engine.hh index 8bb7d6663a..01027d2791 100644 --- a/src/accl/graph/base/base_push_engine.hh +++ b/src/accl/graph/base/base_push_engine.hh @@ -66,7 +66,7 @@ class BasePushEngine : public BaseEngine protected: virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.hh 
b/src/accl/graph/base/base_wl_engine.hh index a5070f0b26..38079f8f94 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,7 +65,7 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; - virtual void scheduleMainEvent(); + virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py deleted file mode 100644 index 87de0fb7d6..0000000000 --- a/src/accl/graph/sega/MPU.py +++ /dev/null @@ -1,42 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject -from m5.objects.WLEngine import WLEngine - -class MPU(ClockedObject): - type = 'MPU' - cxx_header = "accl/graph/sega/mpu.hh" - cxx_class = 'gem5::MPU' - - work_list_engine = Param.WLEngine(NULL, "WLEngine to connect to this MPU") - - respPort = ResponsePort("Port to Receive updates from outside") - reqPort = RequestPort("Port to send updates to the outside") - memPort = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index eb0eed18ab..a743b57262 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,4 +34,4 @@ class PushEngine(BasePushEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - mpu = Param.MPU(Parent.any, "MPU object that owns this PushEngine") + req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index dc19ece06b..f20d0e44df 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,11 +28,9 @@ Import('*') SimObject('ApplyEngine.py') -SimObject('MPU.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') -Source('mpu.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py 
b/src/accl/graph/sega/WLEngine.py index 3bfe9fa16f..2d650ecb92 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,5 +34,6 @@ class WLEngine(BaseWLEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index aff2c5417b..1190786e36 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -50,7 +50,8 @@ class ApplyEngine : public BaseApplyEngine PushEngine* pushEngine; protected: - virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex); + virtual bool sendApplyNotif(uint32_t prop, + uint32_t degree, uint32_t edgeIndex) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc deleted file mode 100644 index 9bda696cb5..0000000000 --- a/src/accl/graph/sega/mpu.cc +++ /dev/null @@ -1,223 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/sega/mpu.hh" - -namespace gem5 -{ - -MPU::MPU(const MPUParams ¶ms): - ClockedObject(params), - respPort(name() + ".respPort", this), - reqPort(name() + ".reqPort", this), - memPort(name() + ".memPort", this), - wlEngine(params.work_list_engine) -{} - -Port& -MPU::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "respPort") { - return respPort; - } else if (if_name == "reqPort") { - return reqPort; - } else if (if_name == "memPort") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -MPU::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. 
- WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 - }; - Edge edges [6] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - memPort.sendFunctional(pkt); - } - - for (int i = 0; i < 6; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - memPort.sendFunctional(pkt); - } -} - -AddrRangeList -MPU::MPURespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -MPU::MPURespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -MPU::MPURespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::MPURespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::MPURespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -MPU::MPUReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::MPUReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -void -MPU::MPUMemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -MPU::MPUMemPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on MPU::MPUMemPort memPort."); -} - -void -MPU::MPUMemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -AddrRangeList -MPU::getAddrRanges() -{ - return memPort.getAddrRanges(); -} - -void -MPU::recvFunctional(PacketPtr pkt) -{ - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to implement later. 
- // wlEngine->recvFunctional(pkt); - } else { - memPort.sendFunctional(pkt); - } -} - -bool -MPU::handleWLUpdate(PacketPtr pkt) -{ - return wlEngine->handleWLUpdate(pkt); -} - -bool -MPU::recvPushUpdate(PacketPtr pkt) -{ - Addr addr = pkt->getAddr(); - for (auto addr_range: memPort.getAddrRanges()) { - if (addr_range.contains(addr)) { - return handleWLUpdate(pkt); - } - } - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return true; -} - -} diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh deleted file mode 100644 index a0472eead5..0000000000 --- a/src/accl/graph/sega/mpu.hh +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ -#define __ACCL_GRAPH_SEGA_MPU_HH__ - -#include "accl/graph/sega/wl_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" -#include "params/MPU.hh" -#include "sim/clocked_object.hh" - -namespace gem5 -{ - -class MPU : public ClockedObject -{ - private: - class MPURespPort : public ResponsePort - { - private: - MPU* owner; - - public: - MPURespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - class MPUReqPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUReqPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - class MPUMemPort : public RequestPort - { - private: - MPU* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MPUMemPort(const std::string& name, MPU* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - 
void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - MPURespPort respPort; - MPUReqPort reqPort; - MPUMemPort memPort; - - WLEngine* wlEngine; - - virtual void startup(); - AddrRangeList getAddrRanges(); - void recvFunctional(PacketPtr pkt); - - public: - PARAMS(MPU); - MPU(const MPUParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - bool handleWLUpdate(PacketPtr pkt); - bool recvPushUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a1fa86da2b..c7b229ad33 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -33,13 +33,58 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms) : BasePushEngine(params), - mpu(params.mpu) + reqPort(name() + "reqPort", this) {} +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { - return mpu->recvPushUpdate(pkt); + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index edf698011d..604df4750d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,6 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_push_engine.hh" -#include "accl/graph/sega/mpu.hh" #include "params/PushEngine.hh" namespace gem5 @@ -41,14 +40,36 @@ class MPU; class PushEngine : public BasePushEngine { private: - MPU* mpu; + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; protected: - virtual bool sendPushUpdate(PacketPtr pkt); + virtual bool sendPushUpdate(PacketPtr pkt) override; public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 823aa49bb9..e565ac119b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ 
b/src/accl/graph/sega/wl_engine.cc @@ -33,12 +33,100 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), + respPort(name() + ".respPort", this), applyEngine(params.apply_engine) {} +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {0, 0, 3, 0}, // Addr: 0 + {0, 0, 1, 3}, // Addr: 16 + {0, 0, 1, 4}, // Addr: 32 + {0, 0, 0, 5}, // Addr: 48 + {0, 0, 0, 5} // Addr: 64 + }; + Edge edges [6] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64} // Addr: 1048640 + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 6; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } +} + bool WLEngine::sendWLNotif(Addr addr){ return applyEngine->recvWLNotif(addr); } +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->cmd == MemCmd::UpdateWL) { + panic("Functional requests should not be made to WL."); + //TODO: Might be a good 
idea to implement later. + // wlEngine->recvFunctional(pkt); + } else { + sendMemFunctional(pkt); + } +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 6946713aaa..f895a7ad32 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,14 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" -#include "base/addr_range.hh" -#include "mem/port.hh" -#include "mem/packet.hh" #include "params/WLEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" -#include "sim/system.hh" - namespace gem5 { @@ -51,14 +44,39 @@ class ApplyEngine; class WLEngine : public BaseWLEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; ApplyEngine* applyEngine; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + protected: - virtual bool sendWLNotif(Addr addr); + virtual bool sendWLNotif(Addr addr) override; public: PARAMS(WLEngine); WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; }; } From d66efdf5a3e2e2fc4d425ad2f80ab22da10a19a5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:23:54 -0800 Subject: [PATCH 059/287] Started fixing memory leak. 
--- src/accl/graph/base/base_apply_engine.cc | 6 +++--- src/accl/graph/base/base_push_engine.cc | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e7b7dd6a22..7b643969df 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,9 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() Addr addr = applyReadQueue.front(); Addr req_addr = (addr / 64) * 64; Addr req_offset = (addr % 64); - RequestPtr request = std::make_shared(req_addr, 64, 0 ,0); - PacketPtr memPkt = new Packet(request, MemCmd::ReadReq); - requestOffset[request] = req_offset; + + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { sendMemReq(memPkt); applyReadQueue.pop(); diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index a963cc9709..6e5aa05779 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -112,7 +112,8 @@ BasePushEngine::processNextPushEvent() for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); Edge e = memoryToEdge(curr_edge_data); - uint32_t *update_data = new uint32_t; + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; From df1340a91e5262a0d97faed7ffd39bf1e62af840 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Feb 2022 18:35:05 -0800 Subject: [PATCH 060/287] Adding newlines. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/apply_engine.cc | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ea158ecdc9..54970d356e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -49,4 +49,4 @@ def __init__(self): exit_event = m5.simulate() print("Simulation finished!") -exit() \ No newline at end of file +exit() diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc index bc3d703cf6..5d5f8daf26 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -42,4 +42,4 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } -} \ No newline at end of file +} From ef0f9669a303035981a9ffc298b4acdf275d1ffc Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 24 Feb 2022 11:43:26 -0800 Subject: [PATCH 061/287] Removed the UpdateWL from the MemCmd. --- src/accl/graph/base/util.cc | 3 ++- src/accl/graph/sega/wl_engine.cc | 13 +++++++------ src/mem/packet.hh | 2 +- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc index 0baa374714..4172607ed0 100644 --- a/src/accl/graph/base/util.cc +++ b/src/accl/graph/base/util.cc @@ -133,7 +133,8 @@ getUpdatePacket(Addr addr, unsigned int size, // bits req->setPC(((Addr)requestorId) << 2); - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e565ac119b..f3c63e71f3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -120,13 +120,14 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - if (pkt->cmd == MemCmd::UpdateWL) { - panic("Functional requests should not be made to WL."); - //TODO: Might be a good idea to 
implement later. - // wlEngine->recvFunctional(pkt); - } else { + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { sendMemFunctional(pkt); - } + // } } } diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 5332ee32a2..a67abbbbaa 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,7 +149,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - UpdateWL, + // UpdateWL, NUM_MEM_CMDS }; From acfffa3e25a866c6dc3aaa844ac195e530a44096 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 25 Feb 2022 11:49:51 -0800 Subject: [PATCH 062/287] Adding initial update. Fixing some bugs. --- src/accl/graph/base/base_wl_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index ef66603de7..1b9d92c1b4 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -62,7 +62,7 @@ void BaseWLEngine::processNextWLReadEvent() requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; - if (memPortBlocked()) { + if (!memPortBlocked()) { sendMemReq(memPkt); updateQueue.pop(); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f3c63e71f3..61bee38c05 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -80,6 +80,15 @@ WLEngine::startup() 16, data, 0); sendMemFunctional(pkt); } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); } bool From 75825c3de944037f32c8b21d73106bcac77cbb00 Mon Sep 17 00:00:00 2001 From: 
Marjan Fariborz Date: Fri, 25 Feb 2022 13:35:24 -0800 Subject: [PATCH 063/287] Adding few debugging flags. --- src/accl/graph/base/SConscript | 2 ++ src/accl/graph/base/base_apply_engine.cc | 7 +++++++ src/accl/graph/base/base_push_engine.cc | 5 ++++- src/accl/graph/base/base_wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 20 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 41c48fc419..c5c8c4e901 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -37,3 +37,5 @@ Source('base_engine.cc') Source('base_push_engine.cc') Source('base_wl_engine.cc') Source('util.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 7b643969df..5eb9d90059 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -31,6 +31,8 @@ #include #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -83,6 +85,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" + , __func__, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? 
@@ -101,6 +105,9 @@ BaseApplyEngine::processNextApplyEvent() if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value", + "into WorkList Item: %s\n" + , __func__, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index 6e5aa05779..f46941b8ed 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_push_engine.hh" #include "accl/graph/base/util.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -47,6 +48,7 @@ BasePushEngine::recvApplyNotif(uint32_t prop, if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } + DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); return true; } @@ -114,7 +116,6 @@ BasePushEngine::processNextPushEvent() Edge e = memoryToEdge(curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, @@ -122,6 +123,8 @@ BasePushEngine::processNextPushEvent() requestorId); if (sendPushUpdate(update)) { memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 1b9d92c1b4..38ebf0f35b 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -27,6 +27,7 @@ */ #include "accl/graph/base/base_wl_engine.hh" +#include "debug/MPU.hh" #include @@ -80,6 +81,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" + , __func__, wl.to_string()); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -89,10 +92,13 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); + DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 61bee38c05..674004d7a5 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/sega/wl_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { From d3f342cab70cc838b254365789afe4947d6677bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:04:53 -0800 Subject: [PATCH 064/287] Adding lock_dir. 
--- configs/accl/sega.py | 6 +- src/accl/graph/base/base_apply_engine.cc | 19 ++++--- src/accl/graph/base/base_apply_engine.hh | 3 +- src/accl/graph/base/base_wl_engine.cc | 23 +++++--- src/accl/graph/base/base_wl_engine.hh | 2 + src/accl/graph/sega/ApplyEngine.py | 1 + src/accl/graph/sega/LockDir.py | 46 +++++++++++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 1 + src/accl/graph/sega/apply_engine.cc | 15 ++++- src/accl/graph/sega/apply_engine.hh | 4 ++ src/accl/graph/sega/lock_dir.cc | 71 ++++++++++++++++++++++++ src/accl/graph/sega/lock_dir.hh | 57 +++++++++++++++++++ src/accl/graph/sega/wl_engine.cc | 15 ++++- src/accl/graph/sega/wl_engine.hh | 5 +- 15 files changed, 248 insertions(+), 22 deletions(-) create mode 100644 src/accl/graph/sega/LockDir.py create mode 100644 src/accl/graph/sega/lock_dir.cc create mode 100644 src/accl/graph/sega/lock_dir.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54970d356e..db0bf4678f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,11 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() + self.lock_dir = LockDirectory() self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine) - self.wl_engine = WLEngine(apply_engine = self.apply_engine) + self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) + self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) self.interconnect = SystemXBar() + self.interconnect.cpu_side_ports = self.wl_engine.mem_port self.interconnect.cpu_side_ports = self.apply_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 5eb9d90059..890d5dd313 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,14 +61,16 @@ 
BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffset[memPkt->req] = req_offset; + if (!memPortBlocked()) { + sendMemReq(memPkt); + applyReadQueue.pop(); + } } if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ schedule(nextApplyCheckEvent, nextCycle()); @@ -113,6 +115,9 @@ BaseApplyEngine::processNextApplyEvent() } else { memRespQueue.pop(); } + if (!releaseAddress(pkt->getAddr())) { + panic("Could not release an address"); + } if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ schedule(nextApplyEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/base_apply_engine.hh index 9111bd074b..f4df298079 100644 --- a/src/accl/graph/base/base_apply_engine.hh +++ b/src/accl/graph/base/base_apply_engine.hh @@ -55,7 +55,8 @@ class BaseApplyEngine : public BaseEngine protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) = 0; - + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 38ebf0f35b..7f1a27aae5 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,16 +56,18 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - Addr req_addr = 
(addr / 64) * 64; - Addr req_offset = addr % 64; + if (acquireAddress(addr)) { + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; + PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); + requestOffsetMap[memPkt->req] = req_offset; + requestValueMap[memPkt->req] = value; - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); + if (!memPortBlocked()) { + sendMemReq(memPkt); + updateQueue.pop(); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -92,7 +94,6 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); - if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); @@ -106,6 +107,10 @@ BaseWLEngine::processNextWLReduceEvent() else { memRespQueue.pop(); } + if (!releaseAddress(resp->getAddr())) { + panic("Could not release an address"); + } + std::cout << "success" << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/base_wl_engine.hh index 38079f8f94..15371f965b 100644 --- a/src/accl/graph/base/base_wl_engine.hh +++ b/src/accl/graph/base/base_wl_engine.hh @@ -65,6 +65,8 @@ class BaseWLEngine : public BaseEngine */ protected: virtual bool sendWLNotif(Addr addr) = 0; + virtual bool acquireAddress(Addr addr) = 0; + virtual bool releaseAddress(Addr addr) = 0; virtual void scheduleMainEvent() override; public: diff --git a/src/accl/graph/sega/ApplyEngine.py b/src/accl/graph/sega/ApplyEngine.py index 5bb0dc0c25..7a446bb620 100644 --- a/src/accl/graph/sega/ApplyEngine.py +++ b/src/accl/graph/sega/ApplyEngine.py @@ -35,3 +35,4 @@ class ApplyEngine(BaseApplyEngine): cxx_class = 
'gem5::ApplyEngine' push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/LockDir.py new file mode 100644 index 0000000000..d21963dc3a --- /dev/null +++ b/src/accl/graph/sega/LockDir.py @@ -0,0 +1,46 @@ +# Copyright (c) 2012-2014, 2017-2018 ARM Limited +# All rights reserved. +# +# The license below extends only to copyright in the software and shall +# not be construed as granting a license to any other intellectual +# property including but not limited to intellectual property relating +# to a hardware implementation of the functionality of the software +# licensed hereunder. You may use the software subject to the license +# terms below provided that you ensure that this notice is replicated +# unmodified and in its entirety in all distributions of the software, +# modified or unmodified, in source code or in binary form. +# +# Copyright (c) 2007 The Regents of The University of Michigan +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class LockDirectory(SimObject): + type = 'LockDirectory' + cxx_header = 'accl/graph/sega/lock_dir.hh' + cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f20d0e44df..e6d2f1fbbc 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -28,9 +28,11 @@ Import('*') SimObject('ApplyEngine.py') +SimObject('LockDir.py') SimObject('PushEngine.py') SimObject('WLEngine.py') Source('apply_engine.cc') +Source('lock_dir.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 2d650ecb92..b6e697266e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -37,3 +37,4 @@ class WLEngine(BaseWLEngine): resp_port = ResponsePort("Port to Receive updates from outside") apply_engine = Param.ApplyEngine(Parent.any, "MPU object that owns this WLEngine") + lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/apply_engine.cc 
index 5d5f8daf26..544bb082ad 100644 --- a/src/accl/graph/sega/apply_engine.cc +++ b/src/accl/graph/sega/apply_engine.cc @@ -32,7 +32,8 @@ namespace gem5{ ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : BaseApplyEngine(params), - pushEngine(params.push_engine) + pushEngine(params.push_engine), + lockDir(params.lock_dir) {} bool @@ -42,4 +43,16 @@ ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) } +bool +ApplyEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +ApplyEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/apply_engine.hh index 1190786e36..c88330487a 100644 --- a/src/accl/graph/sega/apply_engine.hh +++ b/src/accl/graph/sega/apply_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "accl/graph/sega/push_engine.hh" #include "mem/packet.hh" #include "mem/port.hh" @@ -48,10 +49,13 @@ class ApplyEngine : public BaseApplyEngine { private: PushEngine* pushEngine; + LockDirectory* lockDir; protected: virtual bool sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(ApplyEngine); diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc new file mode 100644 index 0000000000..b7efa638fe --- /dev/null +++ b/src/accl/graph/sega/lock_dir.cc @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/lock_dir.hh" + +namespace gem5 +{ + +LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : + SimObject(params) +{} + +bool +LockDirectory::acquire(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + lockOwnerMap[addr] = requestorId; + lockDegreeMap[addr] = 1; + return true; + } else if (lockOwnerMap[addr] == requestorId) { + lockDegreeMap[addr] = lockDegreeMap[addr] + 1; + return true; + } else { + return false; + } +} + +bool +LockDirectory::release(Addr addr, RequestorID requestorId) +{ + if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { + panic("Should not relase an address before acquiring"); + } else if (lockOwnerMap[addr] != requestorId) { + panic("Should not release and address you don't own"); + } else { + lockDegreeMap[addr] = lockDegreeMap[addr] - 1; + if (lockDegreeMap[addr] == 0) { + lockDegreeMap.erase(addr); + lockOwnerMap.erase(addr); + return true; + } + } + return false; +} + +} diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh new file mode 100644 index 0000000000..64d934d42f --- /dev/null +++ b/src/accl/graph/sega/lock_dir.hh @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ +#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ + +#include + +#include "mem/packet.hh" +#include "params/LockDirectory.hh" +#include "sim/sim_object.hh" + +namespace gem5 +{ + +class LockDirectory: public SimObject +{ + private: + std::unordered_map lockOwnerMap; + std::unordered_map lockDegreeMap; + + public: + PARAMS(LockDirectory); + LockDirectory(const LockDirectoryParams ¶ms); + + bool acquire(Addr addr, RequestorID requestorId); + bool release(Addr addr, RequestorID requestorId); +}; + +} + +#endif diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 674004d7a5..e557a08c18 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -34,7 +34,8 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseWLEngine(params), respPort(name() + ".respPort", this), - applyEngine(params.apply_engine) + applyEngine(params.apply_engine), + lockDir(params.lock_dir) {} Port& @@ -139,4 +140,16 @@ WLEngine::recvFunctional(PacketPtr pkt) // } } +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr 
addr) +{ + return lockDir->release(addr, requestorId); +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f895a7ad32..4e8a25795a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_wl_engine.hh" #include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" #include "params/WLEngine.hh" namespace gem5 @@ -64,13 +65,15 @@ class WLEngine : public BaseWLEngine RespPort respPort; ApplyEngine* applyEngine; - + LockDirectory* lockDir; virtual void startup(); void recvFunctional(PacketPtr pkt); protected: virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; public: PARAMS(WLEngine); From eb63831b87d00aed4447daaa7855fd5641e6de3f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 15:42:20 -0800 Subject: [PATCH 065/287] Debugging --- src/accl/graph/base/base_wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.cc | 10 +++++----- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 7f1a27aae5..f5d739da2d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -83,8 +83,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" + , __func__, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -110,7 +110,7 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success" << std::endl; + std::cout << "success 
"<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e557a08c18..a84ed2d52f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,11 +54,11 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {0, 0, 3, 0}, // Addr: 0 - {0, 0, 1, 3}, // Addr: 16 - {0, 0, 1, 4}, // Addr: 32 - {0, 0, 0, 5}, // Addr: 48 - {0, 0, 0, 5} // Addr: 64 + {1000, 1000, 3, 0}, // Addr: 0 + {1000, 1000, 1, 3}, // Addr: 16 + {1000, 1000, 1, 4}, // Addr: 32 + {10000, 1000, 0, 5}, // Addr: 48 + {10000, 10000, 0, 5} // Addr: 64 }; Edge edges [6] = { {0, 16}, // Addr: 1048576 From 4d137d8c5389fb4dd28d4ca6a7e49df1184b9d9b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 15:53:38 -0800 Subject: [PATCH 066/287] More debugging. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/base_engine.cc | 3 +++ src/accl/graph/sega/lock_dir.cc | 12 ++---------- src/accl/graph/sega/lock_dir.hh | 2 +- 4 files changed, 8 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db0bf4678f..db5a36b987 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -39,7 +39,8 @@ def __init__(self): self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) + self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") + # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index 6b40ba4137..f449e6ffdb 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -87,6 +87,9 @@ BaseEngine::MemPort::recvReqRetry() bool BaseEngine::handleMemResp(PacketPtr pkt) { + if (pkt->isResponse() && pkt->isWrite()) { + return true; + } memRespQueue.push(pkt); scheduleMainEvent(); return true; diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/lock_dir.cc index b7efa638fe..6a4496175d 100644 --- a/src/accl/graph/sega/lock_dir.cc +++ b/src/accl/graph/sega/lock_dir.cc @@ -40,10 +40,6 @@ LockDirectory::acquire(Addr addr, RequestorID requestorId) { if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { lockOwnerMap[addr] = requestorId; - lockDegreeMap[addr] = 1; - return true; - } else if (lockOwnerMap[addr] == requestorId) { - lockDegreeMap[addr] = lockDegreeMap[addr] + 1; return true; } else { return false; @@ -58,12 +54,8 @@ LockDirectory::release(Addr addr, RequestorID requestorId) } else if (lockOwnerMap[addr] != requestorId) { panic("Should not release and address you don't own"); } else { - lockDegreeMap[addr] = lockDegreeMap[addr] - 1; - if 
(lockDegreeMap[addr] == 0) { - lockDegreeMap.erase(addr); - lockOwnerMap.erase(addr); - return true; - } + lockOwnerMap.erase(addr); + return true; } return false; } diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/lock_dir.hh index 64d934d42f..012334ce43 100644 --- a/src/accl/graph/sega/lock_dir.hh +++ b/src/accl/graph/sega/lock_dir.hh @@ -42,7 +42,7 @@ class LockDirectory: public SimObject { private: std::unordered_map lockOwnerMap; - std::unordered_map lockDegreeMap; + // std::unordered_map lockDegreeMap; public: PARAMS(LockDirectory); From efcbae85fd36cae6477f1aa66b802f078ef87e2f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 28 Feb 2022 16:34:09 -0800 Subject: [PATCH 067/287] Fixed the bugs. Simulation is an endless loop. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_apply_engine.cc | 7 +++---- src/accl/graph/base/base_engine.cc | 6 ++++-- src/accl/graph/base/base_push_engine.cc | 2 +- src/accl/graph/base/base_wl_engine.cc | 10 ++++------ 5 files changed, 13 insertions(+), 14 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index db5a36b987..163ea169d9 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -50,6 +50,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate() +exit_event = m5.simulate(1000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index 890d5dd313..e222cb5a76 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -61,10 +61,9 @@ BaseApplyEngine::processNextApplyCheckEvent() // pops items off queue, maybe we should pop every n cycles // or change the clock domain for this simobject. 
Addr addr = applyReadQueue.front(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - + Addr req_addr = (addr / 64) * 64; + Addr req_offset = (addr % 64); + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffset[memPkt->req] = req_offset; if (!memPortBlocked()) { diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/base_engine.cc index f449e6ffdb..ad87bb3662 100644 --- a/src/accl/graph/base/base_engine.cc +++ b/src/accl/graph/base/base_engine.cc @@ -27,7 +27,7 @@ */ #include "accl/graph/base/base_engine.hh" - +#include "debug/MPU.hh" namespace gem5 { @@ -36,7 +36,9 @@ BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : system(params.system), memPort(name() + ".memPort", this), requestorId(system->getRequestorId(this)) -{} +{ + DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); +} BaseEngine::~BaseEngine() {} diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/base_push_engine.cc index f46941b8ed..4ebe40e486 100644 --- a/src/accl/graph/base/base_push_engine.cc +++ b/src/accl/graph/base/base_push_engine.cc @@ -121,7 +121,7 @@ BasePushEngine::processNextPushEvent() PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, requestorId); - if (sendPushUpdate(update)) { + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" , __func__, e.to_string(), *update_data); diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index f5d739da2d..921e9c683d 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -56,10 +56,9 @@ void BaseWLEngine::processNextWLReadEvent() uint32_t value = *(pkt->getPtr()); Addr addr = pkt->getAddr(); - if (acquireAddress(addr)) { - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; 
- + Addr req_addr = (addr / 64) * 64; + Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; requestValueMap[memPkt->req] = value; @@ -98,7 +97,7 @@ BaseWLEngine::processNextWLReduceEvent() if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is chanching to: %s\n" + DPRINTF(MPU, "%s: The WLE is changing to: %s\n" , __func__, wl.to_string()); // TODO: Erase map entries, delete wlData; } @@ -110,7 +109,6 @@ BaseWLEngine::processNextWLReduceEvent() if (!releaseAddress(resp->getAddr())) { panic("Could not release an address"); } - std::cout << "success "<< memRespQueue.size() << std::endl; if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ schedule(nextWLReduceEvent, nextCycle()); } From f0dadbb9eea953ca1b69cca3e7bbc3dd994d87e3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 28 Feb 2022 18:34:18 -0800 Subject: [PATCH 068/287] Debugged: Releases the address when the memory is blocked. Added debugging flgs for validation. 
--- src/accl/graph/base/base_apply_engine.cc | 14 ++++++--- src/accl/graph/base/base_wl_engine.cc | 12 ++++++-- src/accl/graph/sega/wl_engine.cc | 17 ++++++----- src/mem/packet.cc | 39 ++++++++++++++++++++++++ src/mem/packet.hh | 2 ++ 5 files changed, 69 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/base_apply_engine.cc index e222cb5a76..39f5dafc67 100644 --- a/src/accl/graph/base/base_apply_engine.cc +++ b/src/accl/graph/base/base_apply_engine.cc @@ -86,8 +86,8 @@ BaseApplyEngine::processNextApplyEvent() Addr request_offset = requestOffset[request]; WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem // to applyengine if temp_prop < prop. If temp_prop has not changed, why // fwd it to applyengine? @@ -102,13 +102,17 @@ BaseApplyEngine::processNextApplyEvent() PacketPtr writePkt = getWritePacket(pkt->getAddr(), 64, data, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); + if (!memPortBlocked()) { if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { sendMemReq(writePkt); memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value", - "into WorkList Item: %s\n" - , __func__, wl.to_string()); + DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" + , __func__, pkt->getAddr() + request_offset, wl.to_string()); } } } else { diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/base_wl_engine.cc index 921e9c683d..fd45b85077 100644 --- a/src/accl/graph/base/base_wl_engine.cc +++ b/src/accl/graph/base/base_wl_engine.cc @@ -58,6 +58,7 @@ void BaseWLEngine::processNextWLReadEvent() Addr addr = pkt->getAddr(); Addr req_addr = (addr / 64) * 64; Addr req_offset = addr % 64; + if (acquireAddress(req_addr)) { PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); requestOffsetMap[memPkt->req] = req_offset; @@ -67,6 +68,9 @@ void BaseWLEngine::processNextWLReadEvent() sendMemReq(memPkt); updateQueue.pop(); } + else{ + releaseAddress(req_addr); + } } if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { schedule(nextWLReadEvent, nextCycle()); @@ -82,8 +86,8 @@ BaseWLEngine::processNextWLReduceEvent() uint32_t value = requestValueMap[resp->req]; WorkListItem wl = memoryToWorkList(respData + request_offset); - DPRINTF(MPU, "%s: The WLE is reading WorkList item: %s %d\n" - , __func__, wl.to_string(), value); + DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" + , __func__, resp->getAddr() + request_offset, wl.to_string(), value); if (value < wl.temp_prop){ //update prop with temp_prop wl.temp_prop = value; @@ -93,6 +97,10 @@ BaseWLEngine::processNextWLReduceEvent() PacketPtr writePkt = getWritePacket(resp->getAddr(), 64, respData, requestorId); + DPRINTF(MPU, "%s: Sending a pkt with this info. 
" + "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", + __func__, writePkt->getAddr(), + writePkt->getSize(), writePkt->printData()); if (!memPortBlocked()) { if (sendWLNotif(resp->getAddr() + request_offset)) { sendMemReq(writePkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a84ed2d52f..03f74f1019 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -54,18 +54,19 @@ WLEngine::startup() //FIXME: This is the current version of our initializer. // This should be updated in the future. WorkListItem vertices [5] = { - {1000, 1000, 3, 0}, // Addr: 0 - {1000, 1000, 1, 3}, // Addr: 16 - {1000, 1000, 1, 4}, // Addr: 32 - {10000, 1000, 0, 5}, // Addr: 48 - {10000, 10000, 0, 5} // Addr: 64 + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 }; - Edge edges [6] = { + Edge edges [7] = { {0, 16}, // Addr: 1048576 {0, 32}, // Addr: 1048592 {0, 48}, // Addr: 1048608 {0, 32}, // Addr: 1048624 - {0, 64} // Addr: 1048640 + {0, 64}, // Addr: 1048640 + {0, 32} }; for (int i = 0; i < 5; i++) { @@ -75,7 +76,7 @@ WLEngine::startup() sendMemFunctional(pkt); } - for (int i = 0; i < 6; i++) { + for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); diff --git a/src/mem/packet.cc b/src/mem/packet.cc index 31dc330cab..da45246e49 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -532,4 +532,43 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } +std::string +Packet::printData() +{ + char ret[1024]; + if (isWrite()) { + uint8_t* data = getPtr(); + std::sprintf(ret,"\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + "V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n" + 
"V[%lu] temp_prop: %u, prop: %u, " + "degree: %u, edgeIndex: %u.\n", + getAddr(), + *((uint32_t*) data), + *((uint32_t*) (data + 4)), + *((uint32_t*) (data + 8)), + *((uint32_t*) (data + 12)), + getAddr() + 16, + *((uint32_t*) (data + 16)), + *((uint32_t*) (data + 20)), + *((uint32_t*) (data + 24)), + *((uint32_t*) (data + 28)), + getAddr() + 32, + *((uint32_t*) (data + 32)), + *((uint32_t*) (data + 36)), + *((uint32_t*) (data + 40)), + *((uint32_t*) (data + 44)), + getAddr() + 48, + *((uint32_t*) (data + 48)), + *((uint32_t*) (data + 52)), + *((uint32_t*) (data + 56)), + *((uint32_t*) (data + 60))); + } + return ret; +} + } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index a67abbbbaa..8803eacced 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -1374,6 +1374,8 @@ class Packet : public Printable template void setRaw(T v); + std::string printData(); + public: /** * Check a functional request against a memory value stored in From b1a59999867d57af5d5083da4f3044ee785f6ad7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 22 Mar 2022 01:24:54 -0700 Subject: [PATCH 069/287] Adding coalescer to the code. 
--- src/accl/graph/base/BaseReadEngine.py | 39 ++++ src/accl/graph/base/BaseReduceEngine.py | 38 ++++ src/accl/graph/base/base_read_engine.cc | 86 ++++++++ src/accl/graph/base/base_read_engine.hh | 101 ++++++++++ src/accl/graph/base/base_reduce_engine.cc | 51 +++++ src/accl/graph/base/base_reduce_engine.hh | 67 +++++++ .../graph/base/{ => old}/BaseApplyEngine.py | 0 src/accl/graph/base/{ => old}/BaseEngine.py | 0 .../graph/base/{ => old}/BasePushEngine.py | 0 src/accl/graph/base/{ => old}/BaseWLEngine.py | 0 .../graph/base/{ => old}/base_apply_engine.cc | 0 .../graph/base/{ => old}/base_apply_engine.hh | 0 src/accl/graph/base/{ => old}/base_engine.cc | 0 src/accl/graph/base/{ => old}/base_engine.hh | 0 .../graph/base/{ => old}/base_push_engine.cc | 0 .../graph/base/{ => old}/base_push_engine.hh | 0 .../graph/base/{ => old}/base_wl_engine.cc | 0 .../graph/base/{ => old}/base_wl_engine.hh | 0 src/accl/graph/sega/coalesce_engine.cc | 187 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.hh | 88 +++++++++ src/accl/graph/sega/{ => old}/ApplyEngine.py | 0 src/accl/graph/sega/{ => old}/LockDir.py | 0 src/accl/graph/sega/{ => old}/PushEngine.py | 0 src/accl/graph/sega/{ => old}/WLEngine.py | 0 src/accl/graph/sega/{ => old}/apply_engine.cc | 0 src/accl/graph/sega/{ => old}/apply_engine.hh | 0 src/accl/graph/sega/{ => old}/lock_dir.cc | 0 src/accl/graph/sega/{ => old}/lock_dir.hh | 0 src/accl/graph/sega/old/push_engine.cc | 90 +++++++++ src/accl/graph/sega/old/push_engine.hh | 77 ++++++++ src/accl/graph/sega/old/wl_engine.cc | 156 +++++++++++++++ src/accl/graph/sega/old/wl_engine.hh | 86 ++++++++ src/accl/graph/sega/push_engine.cc | 144 +++++++++++++- src/accl/graph/sega/push_engine.hh | 32 ++- src/accl/graph/sega/wl_engine.cc | 109 +++++++--- src/accl/graph/sega/wl_engine.hh | 37 ++-- 36 files changed, 1338 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/base/BaseReadEngine.py create mode 100644 src/accl/graph/base/BaseReduceEngine.py create mode 
100644 src/accl/graph/base/base_read_engine.cc create mode 100644 src/accl/graph/base/base_read_engine.hh create mode 100644 src/accl/graph/base/base_reduce_engine.cc create mode 100644 src/accl/graph/base/base_reduce_engine.hh rename src/accl/graph/base/{ => old}/BaseApplyEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseEngine.py (100%) rename src/accl/graph/base/{ => old}/BasePushEngine.py (100%) rename src/accl/graph/base/{ => old}/BaseWLEngine.py (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_apply_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_push_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_push_engine.hh (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.cc (100%) rename src/accl/graph/base/{ => old}/base_wl_engine.hh (100%) create mode 100644 src/accl/graph/sega/coalesce_engine.cc create mode 100644 src/accl/graph/sega/coalesce_engine.hh rename src/accl/graph/sega/{ => old}/ApplyEngine.py (100%) rename src/accl/graph/sega/{ => old}/LockDir.py (100%) rename src/accl/graph/sega/{ => old}/PushEngine.py (100%) rename src/accl/graph/sega/{ => old}/WLEngine.py (100%) rename src/accl/graph/sega/{ => old}/apply_engine.cc (100%) rename src/accl/graph/sega/{ => old}/apply_engine.hh (100%) rename src/accl/graph/sega/{ => old}/lock_dir.cc (100%) rename src/accl/graph/sega/{ => old}/lock_dir.hh (100%) create mode 100644 src/accl/graph/sega/old/push_engine.cc create mode 100644 src/accl/graph/sega/old/push_engine.hh create mode 100644 src/accl/graph/sega/old/wl_engine.cc create mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py new file mode 100644 index 0000000000..84c53465b9 --- /dev/null +++ b/src/accl/graph/base/BaseReadEngine.py @@ -0,0 +1,39 @@ +# -*- 
coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
from m5.params import *
from m5.proxy import *
from m5.objects.ClockedObject import ClockedObject


class BaseReadEngine(ClockedObject):
    """Abstract base SimObject for engines that fetch graph data from
    memory through a single request port."""

    abstract = True
    type = 'BaseReadEngine'
    cxx_header = "accl/graph/base/base_read_engine.hh"
    cxx_class = 'gem5::BaseReadEngine'

    # The System supplies the RequestorID used to tag memory requests.
    system = Param.System(Parent.any, 'System this Engine is a part of')
    # Memory-side port; bound to BaseReadEngine::memPort in C++.
    mem_port = RequestPort("Port to communicate with the memory")
from m5.params import *
from m5.proxy import *
from m5.objects.ClockedObject import ClockedObject


class BaseReduceEngine(ClockedObject):
    """Abstract base SimObject for engines that reduce incoming
    work-list items into the current vertex state."""

    abstract = True
    type = 'BaseReduceEngine'
    cxx_header = "accl/graph/base/base_reduce_engine.hh"
    cxx_class = 'gem5::BaseReduceEngine'

    # The System supplies the RequestorID used to tag memory requests.
    system = Param.System(Parent.any, 'System this Engine is a part of')
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): + ClockedObject(params), + system(params.system), + memPort(name() + ".mem_port", this), + _requestorId(system.getRequestorId(this)), +{} + +BaseReadEngine::~BaseReadEngine() +{} + +Port& +BaseReadEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + //TODO: Investigate sending true all the time + return owner->handleMemResp(pkt); +} + +void +BaseReadEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh new file mode 100644 index 0000000000..99f14bcb06 --- /dev/null +++ b/src/accl/graph/base/base_read_engine.hh @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ + +#include +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReadEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseReadEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + MemPort memPort; + + bool handleMemResp(PacketPtr resp); + + protected: + const RequestorID _requestorId; + + bool memPortBlocked() { return memPort.blocked(); } + void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + + virtual bool handleMemResp(PacketPtr pkt) = 0; + + public: + PARAMS(BaseReadEngine); + + BaseReadEngine(const BaseReadEngineParams ¶ms); + ~BaseReadEngine(); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + RequestorID requestorId() { return _requestorId; } + + 
AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt); + +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc new file mode 100644 index 0000000000..fbfc613313 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/base/base_reduce_engine.hh" + +namespace gem5 +{ + +BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)) +{} + +BaseReduceEngine::~BaseReduceEngine() +{} + +void +BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) +{ + currentWorkListAddress = addr; + currentWorkList = wl; + scheduleReduceEvent(); +} + +} diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh new file mode 100644 index 0000000000..e44f384f26 --- /dev/null +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ + + +#include "accl/base/util.hh" +#include "params/BaseReduceEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseReduceEngine : public ClockedObject +{ + private: + System* system; + + bool handleIncomingWL(Addr addr, WorkListItem wl); + + protected: + Addr currentWorkListAddress; + WorkListItem currentWorkList; + + const RequestorID _requestorId; + + virtual void scheduleReduceEvent() = 0; + + public: + PARAMS(BaseReduceEngine); + + BaseReduceEngine(const BaseReduceEngineParams ¶ms); + ~BaseReduceEngine(); + + RequestorID requestorId() { return _requestorId; } +}; + +} + +#endif // __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ diff --git a/src/accl/graph/base/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py similarity index 100% rename from src/accl/graph/base/BaseApplyEngine.py rename to src/accl/graph/base/old/BaseApplyEngine.py diff --git a/src/accl/graph/base/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py similarity index 100% rename from src/accl/graph/base/BaseEngine.py rename to src/accl/graph/base/old/BaseEngine.py diff --git a/src/accl/graph/base/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py similarity index 100% rename from src/accl/graph/base/BasePushEngine.py rename to src/accl/graph/base/old/BasePushEngine.py diff --git a/src/accl/graph/base/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py similarity index 100% rename from src/accl/graph/base/BaseWLEngine.py rename to src/accl/graph/base/old/BaseWLEngine.py diff --git a/src/accl/graph/base/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc similarity index 100% rename from src/accl/graph/base/base_apply_engine.cc rename to src/accl/graph/base/old/base_apply_engine.cc diff --git a/src/accl/graph/base/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh similarity index 100% 
rename from src/accl/graph/base/base_apply_engine.hh rename to src/accl/graph/base/old/base_apply_engine.hh diff --git a/src/accl/graph/base/base_engine.cc b/src/accl/graph/base/old/base_engine.cc similarity index 100% rename from src/accl/graph/base/base_engine.cc rename to src/accl/graph/base/old/base_engine.cc diff --git a/src/accl/graph/base/base_engine.hh b/src/accl/graph/base/old/base_engine.hh similarity index 100% rename from src/accl/graph/base/base_engine.hh rename to src/accl/graph/base/old/base_engine.hh diff --git a/src/accl/graph/base/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc similarity index 100% rename from src/accl/graph/base/base_push_engine.cc rename to src/accl/graph/base/old/base_push_engine.cc diff --git a/src/accl/graph/base/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh similarity index 100% rename from src/accl/graph/base/base_push_engine.hh rename to src/accl/graph/base/old/base_push_engine.hh diff --git a/src/accl/graph/base/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc similarity index 100% rename from src/accl/graph/base/base_wl_engine.cc rename to src/accl/graph/base/old/base_wl_engine.cc diff --git a/src/accl/graph/base/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh similarity index 100% rename from src/accl/graph/base/base_wl_engine.hh rename to src/accl/graph/base/old/base_wl_engine.hh diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc new file mode 100644 index 0000000000..1f7a94dc7e --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/sega/coalesce_engine.hh" + +#include "accl/sega/wl_engine.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): + BaseReadEngine(params), + reqQueueSize(params.req_queue_size), + conflictAddrQueueSize(params.conflict_addr_queue_size), + nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) +{} + +CoalesceEngine::~CoalesceEngine() +{} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + sendMemFunctional(pkt); +} + +void +CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +{ + peerWLEngine = wl_engine; +} + +bool +CoalesceEngine::recvReadAddr(Addr addr) +{ + assert(reqQueue.size() <= reqQueueSize); + if (reqQueue.size() == reqQueueSize) { + return false; + } + + reqQueue.push(addr); + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } + return true; +} + +void +CoalesceEngine::processNextRespondEvent() +{ + // TODO: Investigate this for optimization + Addr addr = reqQueue.front(); + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + if (cacheBlocks[block_index].allocated) { + // Hit + // TODO: I guess this piece of code code could be optimized. + // Not the code per se. The design it represents. 
+ if (cacheBlocks[block_index].addr == alligned_addr) { + if (!cacheBlocks[block_index].taken[wl_offset]) { + if (cacheBlocks[block_index].valid) { + peerWLEngine->handleIncomingWL(addr, + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].taken[wl_offset] = true; + } else { + cacheBlocks[block_index].pending[wl_offset] = true; + } + reqQueue.pop(); + } + } else { // conflict + assert(conflictAddrQueue.size() <= conflictAddrQueueSize); + if (conflictAddrQueue.size() < conflictAddrQueueSize) { + cacheBlocks[block_index].numConflicts += 1; + conflictAddrQueue.push(addr); + reqQueue.pop(); + } + } + } else { + // miss + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].numConflicts = 0; + cacheBlocks[block_index].pending = {false, false, false, false}; + cacheBlocks[block_index].pending[wl_offset] = true; + cacheBlocks[block_index].taken = {false, false, false, false}; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].allocated = true; + + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); + + if (!memPortBlocked()) { + sendMemReq(pkt); + reqQueue.pop(); + } + } + + if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); + } +} + +/* + void recvWLWrite(Addr addr, WorkListItem wl); +*/ + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + if (pkt->isResp() && pkt->isWrite()) { + return true; + } + + Addr addr = pkt->getAddr(); + uint8_t data = pkt->getPtr(); + + int block_index = addr % 256; + cacheBlocks[block_index].valid = true; + + for (i = 0; i < 4; i++) { + cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].taken[i] = false; + if (cacheBlocks[block_index].pending[i]) { + peerWLEngine->handleIncomingWL(addr + (i * 16), + cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].taken[i] = true; + } + cacheBlocks[block_index].pending = false; + } +} + +void 
+CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr alligned_addr = (addr / 64) * 64; + int block_index = alligned_addr % 256; + int wl_offset = (addr - alligned_addr) / 16; + + assert(cacheBlocks[block_index].taken[wl_offset]); + cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].taken[wl_offset] = false; + + bool taken_item = false; + taken_item &= (cacheBlocks[block_index].taken[0] & + cacheBlocks[block_index].taken[1] & + cacheBlocks[block_index].taken[2] & + cacheBlocks[block_index].taken[3]); + + if (!taken_item) { + for (auto conflictAddr : conflictAddrQueue) { + int conflict_block_index = ((conflictAddr / 64) * 64) % 256; + if (conflict_block_index == block_index) { + // Evict cacheBlocks[block_index] + // Respond to conflictAddr + } + } + } + +} + +} diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh new file mode 100644 index 0000000000..0b349b2c1a --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include "accl/base/base_read_engine.hh" + +namespace gem5 +{ + +class WLEngine; + +class CoalesceEngine : public BaseReadEngine +{ + private: + struct Block + { + WorkListItem items[4]; + Addr addr; + int numConflicts; + bool pending[4]; + bool taken[4]; + bool valid; + bool allocated; + }; + + WLEngine* peerWLEngine; + + Block cacheBlocks[256]; + + int reqQueueSize; + std::queue reqQueue; + + int conflictAddrQueueSize; + std::queue conflictAddrQueue; + + EventFunctionWrapper nextRespondEvent; + void processNextRespondEvent(); + + EventFunctionWrapper nextApplyAndCommitEvent; + void processNextApplyAndCommitEvent(); + + protected: + virtual bool handleMemResp(PacketPtr pkt); + + public: + PARAMS(CoalesceEngine); + + CoalesceEngine(const CoalesceEngineParams ¶ms); + ~CoalesceEngine(); + + void recvFunctional(PacketPtr pkt); + + bool recvReadAddr(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + void registerWLEngine(WLEngine* wl_engine); +} + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/ApplyEngine.py 
b/src/accl/graph/sega/old/ApplyEngine.py similarity index 100% rename from src/accl/graph/sega/ApplyEngine.py rename to src/accl/graph/sega/old/ApplyEngine.py diff --git a/src/accl/graph/sega/LockDir.py b/src/accl/graph/sega/old/LockDir.py similarity index 100% rename from src/accl/graph/sega/LockDir.py rename to src/accl/graph/sega/old/LockDir.py diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py similarity index 100% rename from src/accl/graph/sega/PushEngine.py rename to src/accl/graph/sega/old/PushEngine.py diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py similarity index 100% rename from src/accl/graph/sega/WLEngine.py rename to src/accl/graph/sega/old/WLEngine.py diff --git a/src/accl/graph/sega/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc similarity index 100% rename from src/accl/graph/sega/apply_engine.cc rename to src/accl/graph/sega/old/apply_engine.cc diff --git a/src/accl/graph/sega/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh similarity index 100% rename from src/accl/graph/sega/apply_engine.hh rename to src/accl/graph/sega/old/apply_engine.hh diff --git a/src/accl/graph/sega/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc similarity index 100% rename from src/accl/graph/sega/lock_dir.cc rename to src/accl/graph/sega/old/lock_dir.cc diff --git a/src/accl/graph/sega/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh similarity index 100% rename from src/accl/graph/sega/lock_dir.hh rename to src/accl/graph/sega/old/lock_dir.hh diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc new file mode 100644 index 0000000000..c7b229ad33 --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.cc @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/push_engine.hh" + +namespace gem5 +{ + +PushEngine::PushEngine(const PushEngineParams ¶ms) : + BasePushEngine(params), + reqPort(name() + "reqPort", this) +{} + +Port& +PushEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return BasePushEngine::getPort(if_name, idx); + } +} + +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + +} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh new file mode 100644 index 0000000000..604df4750d --- /dev/null +++ b/src/accl/graph/sega/old/push_engine.hh @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ + +#include "accl/graph/base/base_push_engine.hh" +#include "params/PushEngine.hh" + +namespace gem5 +{ + +class MPU; + +class PushEngine : public BasePushEngine +{ + private: + class ReqPort : public RequestPort + { + private: + PushEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, PushEngine* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + protected: + virtual bool sendPushUpdate(PacketPtr pkt) override; + + public: + PARAMS(PushEngine); + PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + +#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc new file mode 100644 index 0000000000..03f74f1019 --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.cc @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/wl_engine.hh" +#include "debug/MPU.hh" +namespace gem5 +{ + +WLEngine::WLEngine(const WLEngineParams ¶ms): + BaseWLEngine(params), + respPort(name() + ".respPort", this), + applyEngine(params.apply_engine), + lockDir(params.lock_dir) +{} + +Port& +WLEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "resp_port") { + return respPort; + } else { + return BaseWLEngine::getPort(if_name, idx); + } +} + +void +WLEngine::startup() +{ + //FIXME: This is the current version of our initializer. + // This should be updated in the future. + WorkListItem vertices [5] = { + {10000, 10000, 3, 0}, // Addr: 0 + {10000, 10000, 1, 3}, // Addr: 16 + {10000, 10000, 1, 4}, // Addr: 32 + {10000, 10000, 1, 5}, // Addr: 48 + {10000, 10000, 0, 6} // Addr: 64 + }; + Edge edges [7] = { + {0, 16}, // Addr: 1048576 + {0, 32}, // Addr: 1048592 + {0, 48}, // Addr: 1048608 + {0, 32}, // Addr: 1048624 + {0, 64}, // Addr: 1048640 + {0, 32} + }; + + for (int i = 0; i < 5; i++) { + uint8_t* data = workListToMemory(vertices[i]); + PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), + 16, data, 0); + sendMemFunctional(pkt); + } + + for (int i = 0; i < 7; i++) { + uint8_t* data = edgeToMemory(edges[i]); + PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), + 16, data, 0); + sendMemFunctional(pkt); + } + + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = getUpdatePacket( + 0, 4, first_update_data, requestorId); + + handleWLUpdate(first_update); +} + +bool +WLEngine::sendWLNotif(Addr addr){ + return applyEngine->recvWLNotif(addr); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + return owner->handleWLUpdate(pkt); +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void 
+WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + // FIXME: This needs to be fixed + // if (pkt->cmd == MemCmd::UpdateWL) { + // panic("Functional requests should not be made to WL."); + // //TODO: Might be a good idea to implement later. + // // wlEngine->recvFunctional(pkt); + // } else { + sendMemFunctional(pkt); + // } +} + +bool +WLEngine::acquireAddress(Addr addr) +{ + return lockDir->acquire(addr, requestorId); +} + +bool +WLEngine::releaseAddress(Addr addr) +{ + return lockDir->release(addr, requestorId); +} + +} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh new file mode 100644 index 0000000000..4e8a25795a --- /dev/null +++ b/src/accl/graph/sega/old/wl_engine.hh @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ + +#include +#include + +#include "accl/graph/base/base_wl_engine.hh" +#include "accl/graph/sega/apply_engine.hh" +#include "accl/graph/sega/lock_dir.hh" +#include "params/WLEngine.hh" + +namespace gem5 +{ + +class ApplyEngine; + +class WLEngine : public BaseWLEngine +{ + private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + + public: + RespPort(const std::string& name, WLEngine* owner): + ResponsePort(name, owner), owner(owner) + {} + virtual AddrRangeList getAddrRanges() const; + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + RespPort respPort; + ApplyEngine* applyEngine; + LockDirectory* lockDir; + + virtual void startup(); + void recvFunctional(PacketPtr pkt); + + protected: + virtual bool sendWLNotif(Addr addr) override; + virtual bool acquireAddress(Addr addr) override; + virtual bool releaseAddress(Addr addr) override; + + public: + PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} +#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c7b229ad33..c865451999 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,9 +31,16 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) +PushEngine::PushEngine(const PushEngineParams ¶ms): + BaseReadEngine(params), + reqPort(name() + ".req_port", this), + baseEdgeAddr(params.base_edge_addr), + memRespQueueSize(params.mem_resp_queue_size), + pushReqQueueSize(params.push_req_queue_size), + onTheFlyReadReqs(0), + nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + nextReadEvent([this] { processNextReadEvent(); }, name()), + nextPushEvent([this] { processNextPushEvent(); }, name()) {} Port& @@ -41,8 +48,10 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; + } else if (if_name == "mem_port") { + return BaseReadEngine::getPort(if_name, idx); } else { - return BasePushEngine::getPort(if_name, idx); + return SimObject::getPort(if_name, idx); } } @@ -78,13 +87,130 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::sendPushUpdate(PacketPtr pkt) +PushEngine::recvWLItem(WorkListItem wl); { - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; + assert(pushReqQueue.size() <= pushReqQueueSize); + if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { + return false; + } + pushReqQueue.push(wl); + + if ((!nextAddrGenEvent.scheduled()) && + (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + return true; +} + +void +PushEngine::processNextAddrGenEvent() +{ + WorkListItem wl = pushReqQueue.front(); + + std::vector addr_queue; + std::vector offset_queue; + std::vector num_edge_queue; + + for (uint32_t index = 0; index < wl.degree; index++) { + Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); + Addr req_addr = (edge_addr / 64) * 64; + Addr req_offset = edge_addr % 64; + if (addr_queue.size()) { + if (addr_queue.back() == 
req_addr) { + num_edge_queue.back()++; + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + } + else { + addr_queue.push_back(req_addr); + offset_queue.push_back(req_offset); + num_edge_queue.push_back(1); + } + }; + + for (int index = 0; index < addr_queue.size(); index++) { + PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + reqOffsetMap[pkt->req] = offset_queue[index]; + reqNumEdgeMap[pkt->req] = num_edge_queue[index]; + reqValueMap[pkt->req] = wl.prop; + pendingReadReqs.push(pkt); + } + + pushReadReqs.pop(); + + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +void +PushEngine::processNextReadEvent() +{ + if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && + (!memPortBlocked())) { + PacketPtr pkt = pendingReadReqs.front(); + sendMemReq(pkt); + onTheFlyReadReqs++; + pendingReadReqs.pop(); + } + + if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { + schedule(nextReadEvent, nextCycle()); + } +} + +bool +PushEngine::handleMemResp(PacketPtr pkt) +{ + onTheFlyReadReqs--; + memRespQueue.push(pkt); + + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + schedule(nextPushEvent, nextCycle()); + } +} + +void +PushEngine::processNextPushEvent() +{ + PacketPtr pkt = memRespQueue.front(); + RequestPtr req = pkt->req; + uint8_t *data = pkt->getPtr(); + + Addr offset = reqOffsetMap[req]; + int num_edges = reqNumEdgeMap[req]; + uint32_t value = reqValueMap[req]; + + int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); + for (int i = 0; i < num_edges; i++) { + uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + Edge e = memoryToEdge(curr_edge_data); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t 
[data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + PacketPtr update = getUpdatePacket(e.neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, + requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { + memRespQueue.pop(); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); + // TODO: Erase map entries here. + } + } + + if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + schedule(nextPushEvent, nextCycle()); } - return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 604df4750d..bf645eb119 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,15 +29,13 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" #include "params/PushEngine.hh" namespace gem5 { -class MPU; - -class PushEngine : public BasePushEngine +class PushEngine : public BaseReadEngine { private: class ReqPort : public RequestPort @@ -62,14 +60,38 @@ class PushEngine : public BasePushEngine ReqPort reqPort; + Addr baseEdgeAddr; + + int pushReqQueueSize; + std::queue pushReqQueue; + + // TODO: Possibility of infinite queueing + std::queue pendingReadReqs; + + int memRespQueueSize; + int onTheFlyReadReqs; + std::queue memRespQueue; + + EventFunctionWrapper nextAddrGenEvent; + void processNextAddrGenEvent(); + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextPushEvent; + void processNextPushEvent(); + protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; + virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); PushEngine(const PushEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool recvWLItem(WorkListItem wl); }; } diff --git 
a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 03f74f1019..f0c522ff6f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,15 +28,22 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" + namespace gem5 { WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} + BaseReduceEngine(params), + respPort(name() + ".resp_port", this), + blockedByCoalescer(false), + coaleseEngine(params.coalesce_engine), + updateQueueSize(params.update_queue_size), + onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + nextReadEvent([this]{ processNextReadEvent(); }, name()), + nextReduceEvent([this]{ processNextReduceEvent(); }, name()) +{ + coaleseEngine->registerWLEngine(this); +} Port& WLEngine::getPort(const std::string &if_name, PortID idx) @@ -44,7 +51,7 @@ WLEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "resp_port") { return respPort; } else { - return BaseWLEngine::getPort(if_name, idx); + return BaseReduceEngine::getPort(if_name, idx); } } @@ -53,6 +60,8 @@ WLEngine::startup() { //FIXME: This is the current version of our initializer. // This should be updated in the future. + //FIXME: The WLEngine no longer has a MemPort. Update this to + // work with the CoalesceEngine instead. 
WorkListItem vertices [5] = { {10000, 10000, 3, 0}, // Addr: 0 {10000, 10000, 1, 3}, // Addr: 16 @@ -93,11 +102,6 @@ WLEngine::startup() handleWLUpdate(first_update); } -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -107,7 +111,7 @@ WLEngine::RespPort::getAddrRanges() const bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleWLUpdate(pkt); + return owner->handleIncomingUpdate(pkt); } Tick @@ -131,26 +135,81 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } + coaleseEngine->recvFunctional(pkt); } -bool -WLEngine::acquireAddress(Addr addr) +AddrRangeList +WLEngine::getAddrRanges() { - return lockDir->acquire(addr, requestorId); + return coaleseEngine->getAddrRanges(); +} + +void +WLEngine::processNextReadEvent() +{ + PacketPtr update = updateQueue.front(); + Addr update_addr = update->getAddr(); + uint32_t update_value = update->getPtr(); + + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { + if (coalesceEngine->recvReadAddr(update_addr)) { + onTheFlyUpdateMap[update_addr] = update_value + updateQueue.pop(); + } + } else { + // TODO: Generalize this to reduce function rather than just min + onTheFlyUpdateMap[update_addr] = + min(update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop(); + // TODO: Add a stat to count the number of coalescions + } + + if ((!nextReadEvent.scheduled()) && + ((!updateQueue.empty()) || + (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + schedule(nextReadEvent, nextCycle()); + } +} + +void 
+WLEngine::processNextReduceEvent() +{ + // TODO: Generalize this to reduce function rather than just min + currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); + // TODO: Add a delay here + coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); + + onTheFlyUpdateMap.erase(currentWorkListAddress); + currentWorkListAddress = 0; + currentWorkList = {0, 0, 0, 0}; +} + +void +WLEngine::scheduleReduceEvent() +{ + // TODO: Add checks to see if scheduling is necessary or correct. + if (!nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } } bool -WLEngine::releaseAddress(Addr addr) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - return lockDir->release(addr, requestorId); + // TODO: Coalesce updates here too + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } + + updateQueue.push(pkt); + if ((!nextReadEvent.scheduled()) && + (!updateQueue.empty())) { + schedule(nextReadEvent, nextCycle()); + } + return true; } } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4e8a25795a..1846825951 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -32,17 +32,14 @@ #include #include -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" +#include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "params/WLEngine.hh" namespace gem5 { -class ApplyEngine; - -class WLEngine : public BaseWLEngine +class WLEngine : public BaseReduceEngine { private: class RespPort : public ResponsePort @@ -64,22 +61,40 @@ class WLEngine : public BaseWLEngine }; RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; + + bool blockedByCoalescer; + CoalesceEngine* coaleseEngine; + + int updateQueueSize; + std::queue 
updateQueue; + + int onTheFlyUpdateMapSize; + std::unordered_map onTheFlyUpdateMap; virtual void startup(); + void recvFunctional(PacketPtr pkt); + AddrRangeList getAddrRanges() const; + + EventFunctionWrapper nextReadEvent; + void processNextReadEvent(); + + EventFunctionWrapper nextReduceEvent; + void processNextReduceEvent(); + protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; + virtual void scheduleReduceEvent() = 0; public: PARAMS(WLEngine); + WLEngine(const WLEngineParams ¶ms); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + + bool handleIncomingUpdate(PacketPtr pkt); }; } From 4cc59dc9487d376ee1185cabad60a7ead7b1b564 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 16:01:55 -0700 Subject: [PATCH 070/287] Finalizing source code. Before compile. --- src/accl/graph/base/SConscript | 12 +- src/accl/graph/sega/CoalesceEngine.py | 40 ++++ src/accl/graph/sega/PushEngine.py | 40 ++++ src/accl/graph/sega/SConscript | 8 +- src/accl/graph/sega/WLEngine.py | 40 ++++ src/accl/graph/sega/coalesce_engine.cc | 306 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 30 ++- 7 files changed, 377 insertions(+), 99 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine.py create mode 100644 src/accl/graph/sega/PushEngine.py create mode 100644 src/accl/graph/sega/WLEngine.py diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c5c8c4e901..c6a78eb5e8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,15 +27,11 @@ Import('*') -SimObject('BaseApplyEngine.py') -SimObject('BaseEngine.py') -SimObject('BasePushEngine.py') -SimObject('BaseWLEngine.py') +SimObject('BaseReadEngine.py') +SimObject('BaseReduceEngine.py') -Source('base_apply_engine.cc') -Source('base_engine.cc') -Source('base_push_engine.cc') -Source('base_wl_engine.cc') 
+Source('base_read_engine.cc') +Source('base_reduce_engine.cc') Source('util.cc') DebugFlag('MPU') diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py new file mode 100644 index 0000000000..0330da7576 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class CoalesceEngine(BaseReadEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + peer_push_engine = Param.PushEngine(NULL, "") + num_mshr_entry = Param.Int(4, "") + num_tgts_per_mshr = Param.Int(20, "") + outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py new file mode 100644 index 0000000000..9036b4e401 --- /dev/null +++ b/src/accl/graph/sega/PushEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReadEngine import BaseReadEngine + +class PushEngine(BaseReadEngine): + type = 'PushEngine' + cxx_header = "accl/graph/sega/push_engine.hh" + cxx_class = 'gem5::PushEngine' + + req_port = RequestPort("Port to send updates to the outside") + base_edge_addr = Param.Addr() + mem_resp_queue_size = Param.Int(0, "") + push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index e6d2f1fbbc..9b4629838b 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,12 +27,12 @@ Import('*') -SimObject('ApplyEngine.py') -SimObject('LockDir.py') +SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') -Source('apply_engine.cc') -Source('lock_dir.cc') +Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') + +DebugFlag('MPU') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py new file mode 100644 index 0000000000..ec9154b138 --- /dev/null +++ b/src/accl/graph/sega/WLEngine.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseReduceEngine import BaseReduceEngine + +class WLEngine(BaseReduceEngine): + type = 'WLEngine' + cxx_header = "accl/graph/sega/wl_engine.hh" + cxx_class = 'gem5::WLEngine' + + resp_port = ResponsePort("Port to Receive updates from outside") + coalesce_engine = Param.CoaleseEngine(NULL, "") + update_queue_size = Param.Int(0, "") + on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1f7a94dc7e..22bc0d49a6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,14 +29,17 @@ #include "accl/sega/coalesce_engine.hh" #include "accl/sega/wl_engine.hh" +#include "debug/MPU.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), - reqQueueSize(params.req_queue_size), - conflictAddrQueueSize(params.conflict_addr_queue_size), + peerPushEngine(params.peer_push_engine), + numMSHREntry(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} @@ -59,69 +62,100 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) bool CoalesceEngine::recvReadAddr(Addr addr) { - assert(reqQueue.size() <= reqQueueSize); - if (reqQueue.size() == reqQueueSize) { - return false; - } - - reqQueue.push(addr); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } - return true; -} - -void -CoalesceEngine::processNextRespondEvent() -{ - // TODO: Investigate this for optimization - Addr addr = reqQueue.front(); + assert(MSHRMap.size() <= numMSHREntry); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int 
wl_offset = (addr - alligned_addr) / 16; - if (cacheBlocks[block_index].allocated) { + if ((cacheBlocks[block_index].addr == alligned_addr) && + (cacheBlocks[block_index].valid)) { // Hit - // TODO: I guess this piece of code code could be optimized. - // Not the code per se. The design it represents. - if (cacheBlocks[block_index].addr == alligned_addr) { - if (!cacheBlocks[block_index].taken[wl_offset]) { - if (cacheBlocks[block_index].valid) { - peerWLEngine->handleIncomingWL(addr, - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].taken[wl_offset] = true; - } else { - cacheBlocks[block_index].pending[wl_offset] = true; - } - reqQueue.pop(); - } - } else { // conflict - assert(conflictAddrQueue.size() <= conflictAddrQueueSize); - if (conflictAddrQueue.size() < conflictAddrQueueSize) { - cacheBlocks[block_index].numConflicts += 1; - conflictAddrQueue.push(addr); - reqQueue.pop(); - } + addrResponseQueue.push(addr); + worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } + return true; } else { // miss - cacheBlocks[block_index].addr = alligned_addr; - cacheBlocks[block_index].numConflicts = 0; - cacheBlocks[block_index].pending = {false, false, false, false}; - cacheBlocks[block_index].pending[wl_offset] = true; - cacheBlocks[block_index].taken = {false, false, false, false}; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].allocated = true; - - PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); - - if (!memPortBlocked()) { - sendMemReq(pkt); - reqQueue.pop(); + if (MSHRMap.find(block_index) == MSHRMap.end()) { + if (MSHRMap.size() == numMSHREntry) { + // Out of MSHR entries + return false; + } else { + if (cacheBlock[block_index].allocated) { + assert(MSHRMap[block_index].size() 
<= numTgtsPerMSHR) + if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + return false; + } + // MSHR available but conflict + cacheBlocks[block_index].hasConflict = true; + MSHRMap[block_index].push_back(addr); + return true; + } else { + // MSHR available and no conflict + assert( + outstandingMemReqQueue.size() <= + outstandingMemReqQueueSize); + if (outstandingMemReqQueue.size() == + outstandingMemReqQueueSize) { + return false; + } + cacheBlocks[block_index].addr = alligned_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + + MSHRMap[block_index].push_back(addr); + PacketPtr pkt = getReadPacket(alligned_addr, + 64, _requestorId); + outstandingMemReqQueue.push(pkt); + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + return true; + } + } } + } +} + +void +CoalesceEngine::processNextMemReqEvent() +{ + PacketPtr pkt = outstandingMemReqQueue.front(); + + if (!memPortBlocked()) { + sendMemReq(pkt); + outstandingMemReqQueue.pop(); + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); } +} + +void +CoalesceEngine::processNextRespondEvent() +{ + Addr addr_response = addrResponseQueue.front(); + WorkListItem worklist_response = worklistResponseQueue.front(); + + peerWLEngine->handleIncomingWL(addr_response, worklist_response); - if ((!nextRespondEvent.scheduled()) && (!reqQueue.empty())) { + addrResponseQueue.pop(); + worklistResponseQueue.pop(); + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } @@ -139,19 +173,50 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t data = pkt->getPtr(); - int block_index = addr % 256; + + 
assert((cacheBlocks[block_index].allocated) && // allocated cache block + (!cacheBlocks[block_index].valid) && // valid is false + (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR cacheBlocks[block_index].valid = true; - for (i = 0; i < 4; i++) { + for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); - cacheBlocks[block_index].taken[i] = false; - if (cacheBlocks[block_index].pending[i]) { - peerWLEngine->handleIncomingWL(addr + (i * 16), - cacheBlocks[block_index].items[i]); - cacheBlocks[block_index].taken[i] = true; + } + + int bias = 0; + std::vector servicedIndices; + for (int i = 0; i < MSHRMap[block_index].size(); i++) { + Addr miss_addr = MSHRMap[block_index][i]; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + + if (alligned_miss_addr == addr) { + int wl_offset = (miss_addr - alligned_miss_addr) / 16; + addrResponseQueue.push(miss_addr); + worklistResponseQueue.push( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); + servicedIndices.push_back(i); } - cacheBlocks[block_index].pending = false; + } + // TODO: We Can use taken instead of this + for (int i = 0; i < servicedIndices.size(); i++) { + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + servicedIndices[i] - bias); + bias++; + } + + if (MSHRMap[block_index].empty()) { + MSHRMap.erase(block_index); + cacheBlocks[block_index].hasConflict = false; + } else { + cacheBlocks[block_index].hasConflict = true; + } + + if ((!nextRespondEvent.scheduled()) && + (!worklistResponseQueue.empty()) && + (!addrResponseQueue.empty())) { + schedule(nextRespondEvent, nextCycle()); } } @@ -162,26 +227,111 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - assert(cacheBlocks[block_index].taken[wl_offset]); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + (1 << wl_offset)); 
cacheBlocks[block_index].item[wl_offset] = wl; - cacheBlocks[block_index].taken[wl_offset] = false; - - bool taken_item = false; - taken_item &= (cacheBlocks[block_index].taken[0] & - cacheBlocks[block_index].taken[1] & - cacheBlocks[block_index].taken[2] & - cacheBlocks[block_index].taken[3]); - - if (!taken_item) { - for (auto conflictAddr : conflictAddrQueue) { - int conflict_block_index = ((conflictAddr / 64) * 64) % 256; - if (conflict_block_index == block_index) { - // Evict cacheBlocks[block_index] - // Respond to conflictAddr - } + cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + + // TODO: Make this more general and programmable. + // && (cacheBlocks[block_index].hasConflict) + if ((cacheBlocks[block_index].takenMask == 0)) { + evictQueue.push(block_index); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } + +} + +void +CoalesceEngine::processNextApplyAndCommitEvent() +{ + int block_index = evictQueue.front(); + uint8_t changedMask = 0; + uint8_t data[64]; + + for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].temp_prop); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); } + uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); + std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); } + if (changed) { + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + PacketPtr write_pkt = getWritePacket( + cacheBlocks[block_index].addr, 64, data, _requestorId); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + Addr miss_addr = MSHRMap[block_index][0]; + // TODO: Make sure this trick works; + Addr alligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt 
= getReadPacket( + alligned_miss_addr, 64, _requestorId); + outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push(read_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + evictQueue.pop(); + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + outstandingMemReqQueue.push(write_pkt); + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + } + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + evictQueue.pop(); + } else { + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + __func__); + } + } + + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } + + if ((!nextApplyAndCommitEvent.scheduled()) && + (!evictQueue.empty())) { + schedule(nextApplyAndCommitEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index 0b349b2c1a..f5fd85e4cf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/base/base_read_engine.hh" +#include "accl/sega/push_engine.hh" namespace gem5 { @@ -43,22 +44,33 @@ class CoalesceEngine : public BaseReadEngine { WorkListItem items[4]; Addr addr; - int numConflicts; - bool pending[4]; - bool taken[4]; - bool valid; + uint8_t takenMask; bool allocated; + bool valid; + bool hasConflict; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; }; WLEngine* peerWLEngine; - + PushEngine* peerPushEngine; + Block cacheBlocks[256]; - int reqQueueSize; - std::queue reqQueue; + int numMSHREntry; + int numTgtsPerMSHR; + std::unordered_map> MSHRMap; + + int outstandingMemReqQueueSize; + std::queue outstandingMemReqQueue; + + std::queue addrResponseQueue; + std::queue worklistResponseQueue; + + std::queue evictQueue; - int conflictAddrQueueSize; - std::queue conflictAddrQueue; + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From 965a48e61fc7868cf4dfaa190ca99618f0c51d07 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 22 Mar 2022 17:31:55 -0700 Subject: [PATCH 071/287] Compiles. 
--- src/accl/graph/base/SConscript | 2 -- src/accl/graph/base/base_read_engine.cc | 4 +-- src/accl/graph/base/base_read_engine.hh | 11 ++++---- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 9 ++++--- src/accl/graph/base/util.hh | 5 ++++ src/accl/graph/sega/PushEngine.py | 2 +- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 31 ++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 10 +++++--- src/accl/graph/sega/push_engine.cc | 24 ++++++++++++++---- src/accl/graph/sega/push_engine.hh | 7 +++++ src/accl/graph/sega/wl_engine.cc | 29 +++++++++++---------- src/accl/graph/sega/wl_engine.hh | 4 +-- 14 files changed, 88 insertions(+), 54 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index c6a78eb5e8..8aefca2185 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -33,5 +33,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') Source('util.cc') - -DebugFlag('MPU') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 4192cdb565..894831429b 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/base/base_read_engine.hh" +#include "accl/graph/base/base_read_engine.hh" namespace gem5 { @@ -35,7 +35,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - _requestorId(system.getRequestorId(this)), + _requestorId(system->getRequestorId(this)) {} BaseReadEngine::~BaseReadEngine() diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 99f14bcb06..956c50e47d 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #include #include @@ -35,7 +35,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseEngine.hh" +#include "params/BaseReadEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -53,7 +53,7 @@ class BaseReadEngine : public ClockedObject PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseEngine* owner): + MemPort(const std::string& name, BaseReadEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -69,8 +69,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - bool handleMemResp(PacketPtr resp); - protected: const RequestorID _requestorId; @@ -85,6 +83,7 @@ class BaseReadEngine : public ClockedObject BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index fbfc613313..82643ba3ff 100644 --- 
a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -26,7 +26,7 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#include "accl/base/base_reduce_engine.hh" +#include "accl/graph/base/base_reduce_engine.hh" namespace gem5 { diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index e44f384f26..7851eaf585 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -26,11 +26,11 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. */ -#ifndef __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_REDUCE_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ -#include "accl/base/util.hh" +#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +43,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - bool handleIncomingWL(Addr addr, WorkListItem wl); protected: Addr currentWorkListAddress; @@ -60,6 +59,8 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } + + void handleIncomingWL(Addr addr, WorkListItem wl); }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index a4418a1cb8..1066d37d1c 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -26,6 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ +#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ +#define __ACCL_GRAPH_BASE_UTIL_HH__ + #include "base/cprintf.hh" #include "base/types.hh" #include "mem/packet.hh" @@ -75,3 +78,5 @@ PacketPtr getUpdatePacket(Addr addr, unsigned int size, uint8_t *data, RequestorID requestorId); } + +#endif // __ACCL_GRAPH_BASE_UTIL_HH__ diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 9036b4e401..129d9454c7 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,6 +35,6 @@ class PushEngine(BaseReadEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr() + base_edge_addr = Param.Addr("") mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index ec9154b138..cab47fbe7b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,6 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoaleseEngine(NULL, "") + coalesce_engine = Param.CoalesceEngine(NULL, "") update_queue_size = Param.Int(0, "") on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 22bc0d49a6..663559cc63 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -26,9 +26,9 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/sega/coalesce_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" -#include "accl/sega/wl_engine.hh" +#include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" namespace gem5 @@ -40,12 +40,13 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - nextWorkListSendEvent([this] { processNextWorkListSendEvent(); }, name()), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -CoalesceEngine::~CoalesceEngine() -{} +// CoalesceEngine::~CoalesceEngine() +// {} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -86,8 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries return false; } else { - if (cacheBlock[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR) + if (cacheBlocks[block_index].allocated) { + assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { return false; } @@ -122,6 +123,10 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } } + } else { + assert(cacheBlocks[block_index].hasConflict); + MSHRMap[block_index].push_back(addr); + return true; } } } @@ -167,12 +172,12 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResp() && pkt->isWrite()) { + if (pkt->isResponse() && pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); - uint8_t data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); int block_index = addr % 256; assert((cacheBlocks[block_index].allocated) && // allocated cache block @@ -218,6 +223,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } + + return 
true; } void @@ -229,7 +236,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); - cacheBlocks[block_index].item[wl_offset] = wl; + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); // TODO: Make this more general and programmable. @@ -261,10 +268,10 @@ CoalesceEngine::processNextApplyAndCommitEvent() changedMask |= (1 << i); } uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data[i * 16], wl_data, sizeof(WorkListItem)); + std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } - if (changed) { + if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f5fd85e4cf..6086a8855e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,8 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/base/base_read_engine.hh" -#include "accl/sega/push_engine.hh" +#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" +#include "accl/graph/sega/push_engine.hh" +#include "params/CoalesceEngine.hh" namespace gem5 { @@ -85,7 +87,7 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - ~CoalesceEngine(); + // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); @@ -93,7 +95,7 @@ class CoalesceEngine : public BaseReadEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); -} +}; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c865451999..2a978cfcc5 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/push_engine.hh" +#include "debug/MPU.hh" + namespace gem5 { @@ -35,8 +37,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): BaseReadEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - memRespQueueSize(params.mem_resp_queue_size), pushReqQueueSize(params.push_req_queue_size), + memRespQueueSize(params.mem_resp_queue_size), onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextReadEvent([this] { processNextReadEvent(); }, name()), @@ -87,7 +89,7 @@ PushEngine::ReqPort::recvReqRetry() } bool -PushEngine::recvWLItem(WorkListItem wl); +PushEngine::recvWLItem(WorkListItem wl) { assert(pushReqQueue.size() <= pushReqQueueSize); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { @@ -133,14 +135,14 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); + PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; pendingReadReqs.push(pkt); } - pushReadReqs.pop(); + pushReqQueue.pop(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -176,6 +178,7 @@ PushEngine::handleMemResp(PacketPtr pkt) if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } + return true; } void @@ -199,7 +202,8 @@ PushEngine::processNextPushEvent() *update_data = value + 1; PacketPtr update = getUpdatePacket(e.neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); + _requestorId); + if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); DPRINTF(MPU, "%s: Reading %s, updating 
with %d\n" @@ -213,4 +217,14 @@ PushEngine::processNextPushEvent() } } +bool +PushEngine::sendPushUpdate(PacketPtr pkt) +{ + if (!reqPort.blocked()) { + reqPort.sendPacket(pkt); + return true; + } + return false; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index bf645eb119..e97a26c7bd 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/util.hh" #include "params/PushEngine.hh" namespace gem5 @@ -65,6 +66,10 @@ class PushEngine : public BaseReadEngine int pushReqQueueSize; std::queue pushReqQueue; + std::unordered_map reqOffsetMap; + std::unordered_map reqNumEdgeMap; + std::unordered_map reqValueMap; + // TODO: Possibility of infinite queueing std::queue pendingReadReqs; @@ -72,6 +77,8 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + bool sendPushUpdate(PacketPtr pkt); + EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f0c522ff6f..43ad112db3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -36,13 +36,13 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), blockedByCoalescer(false), - coaleseEngine(params.coalesce_engine), + coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()) { - coaleseEngine->registerWLEngine(this); + coalesceEngine->registerWLEngine(this); } Port& @@ -82,14 +82,14 @@ WLEngine::startup() uint8_t* data = workListToMemory(vertices[i]); PacketPtr pkt = getWritePacket(0 + i * 
sizeof(WorkListItem), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } for (int i = 0; i < 7; i++) { uint8_t* data = edgeToMemory(edges[i]); PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), 16, data, 0); - sendMemFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } uint8_t* first_update_data = new uint8_t [4]; @@ -97,9 +97,9 @@ WLEngine::startup() *tempPtr = 0; PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); + 0, 4, first_update_data, _requestorId); - handleWLUpdate(first_update); + handleIncomingUpdate(first_update); } AddrRangeList @@ -135,13 +135,13 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::recvFunctional(PacketPtr pkt) { - coaleseEngine->recvFunctional(pkt); + coalesceEngine->recvFunctional(pkt); } AddrRangeList -WLEngine::getAddrRanges() +WLEngine::getAddrRanges() const { - return coaleseEngine->getAddrRanges(); + return coalesceEngine->getAddrRanges(); } void @@ -149,18 +149,18 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t update_value = update->getPtr(); + uint32_t* update_value = update->getPtr(); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value + onTheFlyUpdateMap[update_addr] = *update_value; updateQueue.pop(); } } else { // TODO: Generalize this to reduce function rather than just min onTheFlyUpdateMap[update_addr] = - min(update_addr, onTheFlyUpdateMap[update_addr]); + std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); // TODO: Add a stat to count the number of coalescions } @@ -176,8 +176,9 @@ void WLEngine::processNextReduceEvent() { // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = min(onTheFlyUpdateMap[currentWorkListAddress], - 
currentWorkList.temp_prop); + currentWorkList.temp_prop = std::min( + onTheFlyUpdateMap[currentWorkListAddress], + currentWorkList.temp_prop); // TODO: Add a delay here coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1846825951..3ce01dd69d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -63,7 +63,7 @@ class WLEngine : public BaseReduceEngine RespPort respPort; bool blockedByCoalescer; - CoalesceEngine* coaleseEngine; + CoalesceEngine* coalesceEngine; int updateQueueSize; std::queue updateQueue; @@ -84,7 +84,7 @@ class WLEngine : public BaseReduceEngine void processNextReduceEvent(); protected: - virtual void scheduleReduceEvent() = 0; + virtual void scheduleReduceEvent(); public: PARAMS(WLEngine); From df5706a46ff4b39293a26c4b3c06dc7aee1aa2d5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:34:29 -0700 Subject: [PATCH 072/287] Debugging after compilation. 
Loop writting to mem --- configs/accl/sega.py | 28 +++++--- src/accl/graph/base/base_reduce_engine.cc | 8 --- src/accl/graph/base/base_reduce_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 83 +++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 60 +++++++++++----- src/accl/graph/sega/wl_engine.hh | 6 +- 8 files changed, 126 insertions(+), 72 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 163ea169d9..f71b0e73e0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,15 +4,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.lock_dir = LockDirectory() - self.push_engine = PushEngine() - self.apply_engine = ApplyEngine(push_engine = self.push_engine, lock_dir = self.lock_dir) - self.wl_engine = WLEngine(apply_engine = self.apply_engine, lock_dir = self.lock_dir) + self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() - - self.interconnect.cpu_side_ports = self.wl_engine.mem_port - self.interconnect.cpu_side_ports = self.apply_engine.mem_port + self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): @@ -30,6 +27,16 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + class SEGA(System): def __init__(self): super(SEGA, self).__init__() @@ 
-40,8 +47,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl(dram = DDR4_2400_8x8(range=AddrRange("4GiB"))) - + # self.mem_ctrl = MemCtrl() + # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) + # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.port) @@ -50,6 +58,6 @@ def __init__(self): m5.instantiate() -exit_event = m5.simulate(1000000) +exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 82643ba3ff..38a8662ed0 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -40,12 +40,4 @@ BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): BaseReduceEngine::~BaseReduceEngine() {} -void -BaseReduceEngine::handleIncomingWL(Addr addr, WorkListItem wl) -{ - currentWorkListAddress = addr; - currentWorkList = wl; - scheduleReduceEvent(); -} - } diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 7851eaf585..64d6e4c8c0 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -50,8 +50,6 @@ class BaseReduceEngine : public ClockedObject const RequestorID _requestorId; - virtual void scheduleReduceEvent() = 0; - public: PARAMS(BaseReduceEngine); @@ -60,7 +58,7 @@ class BaseReduceEngine : public ClockedObject RequestorID requestorId() { return _requestorId; } - void handleIncomingWL(Addr addr, WorkListItem wl); + virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 663559cc63..aa6bc99887 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,8 +45,16 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) {} -// CoalesceEngine::~CoalesceEngine() -// {} +void +CoalesceEngine::startup() +{ + for (int i = 0; i < 256; i++) { + cacheBlocks[i].takenMask = 0; + cacheBlocks[i].allocated = false; + cacheBlocks[i].valid = false; + cacheBlocks[i].hasConflict = false; + } +} void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -64,6 +72,8 @@ bool CoalesceEngine::recvReadAddr(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); + DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + __func__, addr); Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; @@ -71,11 +81,13 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == alligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" + , __func__, addr); addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -93,18 +105,26 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } // MSHR available but conflict + DPRINTF(MPU, "%s: Read request with addr: %lu missed with " + "conflict. 
Making a request for " + "alligned_addr: %lu.\n", + __func__, addr, alligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { // MSHR available and no conflict assert( - outstandingMemReqQueue.size() <= + outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - if (outstandingMemReqQueue.size() == + if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { return false; } + DPRINTF(MPU, "%s: Read request with addr: " + "%lu missed with no conflict. " + "Making a request for alligned_addr: %lu.\n" + , __func__, addr, alligned_addr); cacheBlocks[block_index].addr = alligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; @@ -112,7 +132,7 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, + PacketPtr pkt = getReadPacket(alligned_addr, 64, _requestorId); outstandingMemReqQueue.push(pkt); @@ -124,11 +144,15 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { - assert(cacheBlocks[block_index].hasConflict); + if ((!cacheBlocks[block_index].hasConflict) && + ((addr < cacheBlocks[block_index].addr) || + (addr >= (cacheBlocks[block_index].addr + 64)))) { + cacheBlocks[block_index].hasConflict = true; + } MSHRMap[block_index].push_back(addr); return true; } - } + } } void @@ -143,7 +167,7 @@ CoalesceEngine::processNextMemReqEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } } @@ -152,23 +176,19 @@ CoalesceEngine::processNextRespondEvent() { Addr addr_response = addrResponseQueue.front(); WorkListItem worklist_response = worklistResponseQueue.front(); - + peerWLEngine->handleIncomingWL(addr_response, worklist_response); addrResponseQueue.pop(); worklistResponseQueue.pop(); if ((!nextRespondEvent.scheduled()) && - 
(!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } -/* - void recvWLWrite(Addr addr, WorkListItem wl); -*/ - bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -183,11 +203,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - cacheBlocks[block_index].valid = true; for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); } + cacheBlocks[block_index].valid = true; int bias = 0; std::vector servicedIndices; @@ -201,12 +221,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - servicedIndices.push_back(i); + servicedIndices.push_back(i); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; } @@ -219,7 +239,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && + (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -233,12 +253,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) Addr alligned_addr = (addr / 64) * 64; int block_index = alligned_addr % 256; int wl_offset = (addr - alligned_addr) / 16; - - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + DPRINTF(MPU, "%s: Recieved a WorkList write. 
addr: %lu, wl: %s.\n", + __func__, addr, wl.to_string()); + DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, alligned_addr, + block_index, wl_offset, cacheBlocks[block_index].takenMask); + assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - + // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { @@ -267,6 +291,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } + DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " + "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, + i, cacheBlocks[block_index].items[i].to_string()); uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); } @@ -275,7 +302,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = getWritePacket( cacheBlocks[block_index].addr, 64, data, _requestorId); - + if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; @@ -304,7 +331,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = true; evictQueue.pop(); } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { @@ -325,16 +352,16 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].hasConflict = false; evictQueue.pop(); } 
else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , + DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } } - + if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); + schedule(nextMemReqEvent, nextCycle()); } - + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6086a8855e..6dc7bc1001 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -56,7 +56,7 @@ class CoalesceEngine : public BaseReadEngine WLEngine* peerWLEngine; PushEngine* peerPushEngine; - + Block cacheBlocks[256]; int numMSHREntry; @@ -71,6 +71,8 @@ class CoalesceEngine : public BaseReadEngine std::queue evictQueue; + virtual void startup(); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 2a978cfcc5..06b5381641 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,6 +95,7 @@ PushEngine::recvWLItem(WorkListItem wl) if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } + pushReqQueue.push(wl); if ((!nextAddrGenEvent.scheduled()) && @@ -204,10 +205,10 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, _requestorId); + DPRINTF(MPU, "%s: Reading %s, updating with %d\n" + , __func__, e.to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); // TODO: Erase map entries here. 
} } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 43ad112db3..b7f59987cb 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -154,45 +154,70 @@ WLEngine::processNextReadEvent() if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { // TODO: Generalize this to reduce function rather than just min + DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." + "update_addr: %lu, update_value: %u, old_value: %u.\n", + __func__, update_addr, *update_value, + onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); updateQueue.pop(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } if ((!nextReadEvent.scheduled()) && - ((!updateQueue.empty()) || - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize))) { + (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } void -WLEngine::processNextReduceEvent() +WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - // TODO: Generalize this to reduce function rather than just min - currentWorkList.temp_prop = std::min( - onTheFlyUpdateMap[currentWorkListAddress], - currentWorkList.temp_prop); - // TODO: Add a delay here - coalesceEngine->recvWLWrite(currentWorkListAddress, currentWorkList); - - onTheFlyUpdateMap.erase(currentWorkListAddress); - currentWorkListAddress = 
0; - currentWorkList = {0, 0, 0, 0}; + assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; + // TODO: Add checks to see if scheduling is necessary or correct. + if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + schedule(nextReduceEvent, nextCycle()); + } } void -WLEngine::scheduleReduceEvent() +WLEngine::processNextReduceEvent() { - // TODO: Add checks to see if scheduling is necessary or correct. - if (!nextReduceEvent.scheduled()) { - schedule(nextReduceEvent, nextCycle()); + + std::unordered_map::iterator it = + addrWorkListMap.begin(); + + std::vector servicedAddresses; + while (it != addrWorkListMap.end()) { + Addr addr = it->first; + WorkListItem wl = it->second; + uint32_t update_value = onTheFlyUpdateMap[addr]; + DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " + "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + onTheFlyUpdateMap[addr]); + // TODO: Generalize this to reduce function rather than just min + wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); + servicedAddresses.push_back(addr); + it++; + } + + addrWorkListMap.clear(); + for (int i = 0; i < servicedAddresses.size(); i++) { + onTheFlyUpdateMap.erase(servicedAddresses[i]); } } @@ -206,6 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push(pkt); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3ce01dd69d..1ccb13d91e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,6 +71,7 @@ class WLEngine : public BaseReduceEngine int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; + std::unordered_map addrWorkListMap; virtual void startup(); void recvFunctional(PacketPtr pkt); @@ -83,9 +84,6 @@ 
class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); - protected: - virtual void scheduleReduceEvent(); - public: PARAMS(WLEngine); @@ -95,6 +93,8 @@ class WLEngine : public BaseReduceEngine PortID idx=InvalidPortID) override; bool handleIncomingUpdate(PacketPtr pkt); + + virtual void handleIncomingWL(Addr addr, WorkListItem wl); }; } From ca2f0692bf3cf8fcd4b4459e1b352c6d795b95b0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 23 Mar 2022 00:51:48 -0700 Subject: [PATCH 073/287] Correctness tested with small graph. --- src/accl/graph/sega/coalesce_engine.cc | 23 ++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index aa6bc99887..62062116c2 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -265,8 +265,19 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) + bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - evictQueue.push(block_index); + for (auto index : evictQueue) { + if (block_index == index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + } + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -329,7 +340,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { outstandingMemReqQueue.push(write_pkt); @@ -350,11 +363,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop(); + evictQueue.pop_front(); + DPRINTF(MPU, "%s: evictQueue.size: %u.\n", + __func__, evictQueue.size()); } else { DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , __func__); } + } else { + evictQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6dc7bc1001..3290f646f4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -69,7 +69,7 @@ class CoalesceEngine : public BaseReadEngine std::queue addrResponseQueue; std::queue worklistResponseQueue; - std::queue evictQueue; + std::deque evictQueue; virtual void startup(); From 358c8e6e9e0a59f7a5a3d6f780e47b559d3e524e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 23 Mar 2022 09:53:26 -0700 Subject: [PATCH 074/287] Added performance statistics. 
--- src/accl/graph/sega/coalesce_engine.cc | 32 +++++++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 18 +++++++++++++++ src/accl/graph/sega/wl_engine.cc | 22 +++++++++++++++++- src/accl/graph/sega/wl_engine.hh | 15 ++++++++++++ 4 files changed, 85 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62062116c2..d58a36188e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,7 +42,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()) + nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + stats(*this) {} void @@ -86,6 +87,8 @@ CoalesceEngine::recvReadAddr(Addr addr) addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; + stats.numVertexReads++; if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && (!addrResponseQueue.empty())) { @@ -138,6 +141,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockReads++; schedule(nextMemReqEvent, nextCycle()); } return true; @@ -221,6 +225,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) worklistResponseQueue.push( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.numVertexReads++; servicedIndices.push_back(i); } } @@ -262,6 +267,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 
<< wl_offset); + stats.numVertexWrites++; // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) @@ -376,6 +382,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { + stats.numVertexBlockWrites++; schedule(nextMemReqEvent, nextCycle()); } @@ -385,4 +392,27 @@ CoalesceEngine::processNextApplyAndCommitEvent() } } +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + + ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), + "Number of memory blocks writes for vertecies"), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3290f646f4..d45fffa3aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -32,6 +32,7 @@ #include "accl/graph/base/base_read_engine.hh" #include "accl/graph/base/util.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/statistics.hh" #include "params/CoalesceEngine.hh" namespace gem5 @@ -82,6 +83,23 @@ class CoalesceEngine : public BaseReadEngine EventFunctionWrapper nextApplyAndCommitEvent; void processNextApplyAndCommitEvent(); + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + void regStats() override; + + CoalesceEngine &coalesce; + + statistics::Scalar numVertexBlockReads; + 
statistics::Scalar numVertexBlockWrites; + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + }; + + CoalesceStats stats; + protected: virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b7f59987cb..517d10ef67 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,8 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), - nextReduceEvent([this]{ processNextReduceEvent(); }, name()) + nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + stats(*this) { coalesceEngine->registerWLEngine(this); } @@ -171,6 +172,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); + stats.onTheFlyCoalesce++; updateQueue.pop(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions @@ -209,6 +211,7 @@ WLEngine::processNextReduceEvent() "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + stats.numReduce++; wl.temp_prop = std::min(update_value, wl.temp_prop); coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); @@ -239,4 +242,21 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } +WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) + : statistics::Group(&_wl), + wl(_wl), + + ADD_STAT(numReduce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies"), + ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + "Number of memory blocks read for vertecies") +{ +} + +void +WLEngine::WorkListStats::regStats() +{ + using 
namespace statistics; +} + } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1ccb13d91e..891916e7af 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 @@ -84,6 +85,20 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + struct WorkListStats : public statistics::Group + { + WorkListStats(WLEngine &worklist); + + void regStats() override; + + WLEngine &wl; + + statistics::Scalar numReduce; + statistics::Scalar onTheFlyCoalesce; + }; + + WorkListStats stats; + public: PARAMS(WLEngine); From c6ae6a6c93f0527d83044d4b207a9507a779a1b3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 14:10:40 -0700 Subject: [PATCH 075/287] Updating definitions for structs and removing unnecessary funcs. 
--- configs/accl/sega.py | 50 +++++--- src/accl/graph/base/base_read_engine.cc | 15 +++ src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/base/base_reduce_engine.hh | 2 - src/accl/graph/base/util.cc | 145 ---------------------- src/accl/graph/base/util.hh | 54 ++++---- src/accl/graph/sega/coalesce_engine.cc | 98 ++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 12 +- src/accl/graph/sega/push_engine.cc | 42 ++++++- src/accl/graph/sega/push_engine.hh | 4 + src/accl/graph/sega/wl_engine.cc | 59 ++------- src/accl/graph/sega/wl_engine.hh | 1 - 12 files changed, 201 insertions(+), 285 deletions(-) delete mode 100644 src/accl/graph/base/util.cc diff --git a/configs/accl/sega.py b/configs/accl/sega.py index f71b0e73e0..8ea247106e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,9 +4,13 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, push_req_queue_size = 16) - self.coalesce_engine = CoalesceEngine(peer_push_engine=self.push_engine) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size = 16, on_the_fly_update_map_size=8) + self.push_engine = PushEngine(base_edge_addr=0x100000, + push_req_queue_size = 16) + self.coalesce_engine = CoalesceEngine( + peer_push_engine=self.push_engine) + self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, + update_queue_size = 16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port @@ -27,31 +31,41 @@ def getMemPort(self): def setMemPort(self, port): self.interconnect.mem_side_ports = port - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port +class MPUMemory(SubSystem): + def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + super(MPUMemory, self).__init__() + self.vertex_mem_ctrl = SimpleMemory( + 
range=vertex_range, bandwidth="25GB/s", + latency="30ns", image_file=vertex_binary) + self.edge_mem_ctrl = SimpleMemory( + range=edge_range, bandwidth="25GB/s", + latency="30ns", image_file=edge_binary) + self.interconnect = SystemXBar() + + self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port + self.interconnect.mem_side_ports = self.edge_mem_ctrl.port - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port + def getPort(self): + return self.interconnect.cpu_side_ports + def setPort(self, port): + self.interconnect.cpu_side_ports = port class SEGA(System): def __init__(self): super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.mpu = MPU() - self.mem_ctrl = SimpleMemory(range=AddrRange("4GiB"), bandwidth="1000GB/s", latency = "30ns") - # self.mem_ctrl = MemCtrl() - # self.mem_ctrl.dram = DDR4_2400_8x8(range=AddrRange(start=0x000000, size="1MiB")) - # self.mem_ctrl.nvm = NVM_2400_1x64(range=AddrRange(start=0x100000, size="1MiB")) + self.mem_ctrl = MPUMemory( + vertex_range=AddrRange(start=0x000000, size="2GiB"), + vertex_binary="live-journal/graph_binaries/vertices", + edge_range=AddrRange(start=0x80000000, size="2GiB"), + edge_binary="live-journal/graph_binaries/edgelist_0") + self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.port) + self.mpu.setMemPort(self.mem_ctrl.getPort()) system = SEGA() root = Root(full_system = False, system = system) diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 894831429b..a32237db35 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -83,4 +83,19 @@ BaseReadEngine::MemPort::recvReqRetry() } } +PacketPtr +BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = 
std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + } diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 956c50e47d..591b51aeb7 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -78,12 +78,14 @@ class BaseReadEngine : public ClockedObject virtual bool handleMemResp(PacketPtr pkt) = 0; + PacketPtr createReadPacket(Addr addr, unsigned int size); + public: PARAMS(BaseReadEngine); BaseReadEngine(const BaseReadEngineParams ¶ms); ~BaseReadEngine(); - + Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index 64d6e4c8c0..f2245f571f 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -45,8 +45,6 @@ class BaseReduceEngine : public ClockedObject protected: - Addr currentWorkListAddress; - WorkListItem currentWorkList; const RequestorID _requestorId; diff --git a/src/accl/graph/base/util.cc b/src/accl/graph/base/util.cc deleted file mode 100644 index 4172607ed0..0000000000 --- a/src/accl/graph/base/util.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/util.hh" - -namespace gem5 -{ - -WorkListItem -memoryToWorkList(uint8_t* data){ - WorkListItem wl; - - uint32_t temp_prop = *((uint32_t*) data); - uint32_t prop = *((uint32_t*) (data + 4)); - uint32_t degree = *((uint32_t*) (data + 8)); - uint32_t addr = *((uint32_t*) (data + 12)); - - wl = {temp_prop, prop, degree, addr}; - return wl; -} - -uint8_t* -workListToMemory(WorkListItem wl){ - int data_size = sizeof(WorkListItem) / sizeof(uint8_t); - uint8_t* data = new uint8_t [data_size]; - - uint32_t* tempPtr = (uint32_t*) data; - *tempPtr = wl.temp_prop; - - uint32_t* propPtr = (uint32_t*) (data + 4); - *propPtr = wl.prop; - - uint32_t* degreePtr = (uint32_t*) (data + 8); - *degreePtr = wl.degree; - - uint32_t* edgePtr = (uint32_t*) (data + 12); - *edgePtr = wl.edgeIndex; - - return data; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -Edge -memoryToEdge(uint8_t *data) -{ - uint64_t weight = *((uint64_t*) data); - Addr neighbor = *((Addr*) (data + 8)); // data + 8 because weight: 8 bytes - Edge e = {weight, neighbor}; - return e; -} - -// Edge: (weight: 64 bits, neighbor: 64 bits) -uint8_t* -edgeToMemory(Edge e) -{ - int data_size = (int) ((sizeof(Edge)) / (sizeof(uint8_t))); - - uint8_t* data = new uint8_t [data_size]; - - uint64_t* weightPtr = (uint64_t*) data; - *weightPtr = e.weight; - - Addr* neighborPtr = (Addr*) (data + 8); // data + 8 because weight: 8 bytes - *neighborPtr = e.neighbor; - - return data; -} - -PacketPtr -getReadPacket(Addr addr, unsigned int size, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId) -{ - RequestPtr req = 
std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -PacketPtr -getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId) -{ - RequestPtr req = std::make_shared(addr, size, 0, - requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr)requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -} diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/util.hh index 1066d37d1c..b51a9f0781 100644 --- a/src/accl/graph/base/util.hh +++ b/src/accl/graph/base/util.hh @@ -30,52 +30,56 @@ #define __ACCL_GRAPH_BASE_UTIL_HH__ #include "base/cprintf.hh" -#include "base/types.hh" -#include "mem/packet.hh" -#include "mem/request.hh" namespace gem5 { -struct WorkListItem +struct __attribute__ ((packed)) WorkListItem { - uint32_t temp_prop; - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; + uint32_t tempProp : 32; + uint32_t prop : 32; + uint32_t degree : 32; + uint32_t edgeIndex : 32; std::string to_string() { return csprintf( "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", - temp_prop, prop, degree, edgeIndex); + tempProp, prop, degree, edgeIndex); } + WorkListItem(): + tempProp(0), + prop(0), + degree(0), + edgeIndex(0) + {} + + WorkListItem(uint32_t temp_prop, uint32_t prop, + uint32_t degree, uint32_t edge_index): + tempProp(temp_prop), + prop(prop), + degree(degree), + edgeIndex(edge_index) + {} + }; -struct Edge +struct __attribute__ ((packed)) Edge { - uint64_t weight; - Addr neighbor; + uint16_t weight : 16; + uint64_t neighbor : 48; std::string to_string() { return csprintf("Edge{weight: %lu, neighbor: 
%lu}", weight, neighbor); } -}; -WorkListItem memoryToWorkList(uint8_t* data); -uint8_t* workListToMemory(WorkListItem wl); - -Edge memoryToEdge(uint8_t* data); -uint8_t* edgeToMemory(Edge e); - -PacketPtr getReadPacket(Addr addr, unsigned int size, - RequestorID requestorId); -PacketPtr getWritePacket(Addr addr, unsigned int size, - uint8_t* data, RequestorID requestorId); -PacketPtr getUpdatePacket(Addr addr, unsigned int size, - uint8_t *data, RequestorID requestorId); + Edge(uint16_t weight, uint64_t neighbor): + weight(weight), + neighbor(neighbor) + {} +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d58a36188e..67874cb9b9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -75,29 +75,33 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - if ((cacheBlocks[block_index].addr == alligned_addr) && + if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); + // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push(addr); worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); + stats.readHits++; stats.numVertexReads++; - if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + + assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + if 
(!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } return true; } else { // miss if (MSHRMap.find(block_index) == MSHRMap.end()) { + assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries return false; @@ -110,12 +114,14 @@ CoalesceEngine::recvReadAddr(Addr addr) // MSHR available but conflict DPRINTF(MPU, "%s: Read request with addr: %lu missed with " "conflict. Making a request for " - "alligned_addr: %lu.\n", - __func__, addr, alligned_addr); + "aligned_addr: %lu.\n", + __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); return true; } else { + // TODO: Set valid to false every deallocation and + // assert valid == false here. // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= @@ -126,31 +132,34 @@ CoalesceEngine::recvReadAddr(Addr addr) } DPRINTF(MPU, "%s: Read request with addr: " "%lu missed with no conflict. " - "Making a request for alligned_addr: %lu.\n" - , __func__, addr, alligned_addr); - cacheBlocks[block_index].addr = alligned_addr; + "Making a request for aligned_addr: %lu.\n" + , __func__, addr, aligned_addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; MSHRMap[block_index].push_back(addr); - PacketPtr pkt = getReadPacket(alligned_addr, - 64, _requestorId); + // TODO: Parameterize 64 to memory atom size + PacketPtr pkt = createReadPacket(aligned_addr, 64); outstandingMemReqQueue.push(pkt); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockReads++; + stats.numVertexBlockReads++; + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); } return true; } } } else { + if (MSHRMap[block_index].size() == 
numTgtsPerMSHR) { + return false; + } if ((!cacheBlocks[block_index].hasConflict) && - ((addr < cacheBlocks[block_index].addr) || - (addr >= (cacheBlocks[block_index].addr + 64)))) { + (aligned_addr != cacheBlocks[block_index].addr)) { cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); @@ -196,20 +205,24 @@ CoalesceEngine::processNextRespondEvent() bool CoalesceEngine::handleMemResp(PacketPtr pkt) { - if (pkt->isResponse() && pkt->isWrite()) { + assert(pkt->isResponse()); + if (pkt->isWrite()) { return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; + int block_index = addr % 256; // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR for (int i = 0; i < 4; i++) { - cacheBlocks[block_index].items[i] = memoryToWorkList(data + (i * 16)); + cacheBlocks[block_index].items[i] = *((WorkListItem*) ( + data + (i * sizeof(WorkListItem)))); } cacheBlocks[block_index].valid = true; @@ -252,16 +265,32 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } +PacketPtr +CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr alligned_addr = (addr / 64) * 64; - int block_index = alligned_addr % 256; - int wl_offset = (addr - alligned_addr) / 16; + Addr aligned_addr = (addr / 64) * 64; + int block_index = aligned_addr % 256; + int 
wl_offset = (addr - aligned_addr) / 16; DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: alligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, alligned_addr, + DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " + "takenMask: %u.\n", __func__, aligned_addr, block_index, wl_offset, cacheBlocks[block_index].takenMask); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -298,35 +327,36 @@ CoalesceEngine::processNextApplyAndCommitEvent() { int block_index = evictQueue.front(); uint8_t changedMask = 0; + // TODO: parameterize 64 to memory atom size uint8_t data[64]; for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].temp_prop); + cacheBlocks[block_index].items[i].tempProp); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); } DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = workListToMemory(cacheBlocks[block_index].items[i]); - std::memcpy(data + (i * 16), wl_data, sizeof(WorkListItem)); + uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + std::memcpy(data + (i * sizeof(WorkListItem)), + wl_data, sizeof(WorkListItem)); } if (changedMask) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - PacketPtr write_pkt = getWritePacket( - cacheBlocks[block_index].addr, 64, data, _requestorId); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, data); if ((cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ Addr miss_addr = MSHRMap[block_index][0]; // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = getReadPacket( - alligned_miss_addr, 64, _requestorId); + PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); outstandingMemReqQueue.push(write_pkt); outstandingMemReqQueue.push(read_pkt); // TODO: This should be improved diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index d45fffa3aa..4bb21676d4 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -35,6 +35,8 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +// TODO: Add parameters for size, memory atom size, type size, +// length of items in the blocks. 
namespace gem5 { @@ -53,6 +55,13 @@ class CoalesceEngine : public BaseReadEngine bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; + Block(): + addr(0), + takenMask(0), + allocated(false), + valid(false), + hasConflict(false) + {} }; WLEngine* peerWLEngine; @@ -74,6 +83,8 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); @@ -107,7 +118,6 @@ class CoalesceEngine : public BaseReadEngine PARAMS(CoalesceEngine); CoalesceEngine(const CoalesceEngineParams ¶ms); - // ~CoalesceEngine(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 06b5381641..d09da113ee 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -57,6 +57,19 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::startup() +{ + uint8_t* first_update_data = new uint8_t [4]; + uint32_t* tempPtr = (uint32_t*) first_update_data; + *tempPtr = 0; + + PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + + sendPushUpdate(first_update); +} + + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -136,7 +149,7 @@ PushEngine::processNextAddrGenEvent() }; for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, _requestorId); + PacketPtr pkt = createReadPacket(addr_queue[index], 64); reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; @@ -182,6 +195,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } +// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. 
void PushEngine::processNextPushEvent() { @@ -196,17 +210,16 @@ PushEngine::processNextPushEvent() int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); + Edge* e = (Edge*) (curr_edge_data); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - _requestorId); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); + , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
@@ -218,6 +231,23 @@ PushEngine::processNextPushEvent() } } +PacketPtr +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + bool PushEngine::sendPushUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index e97a26c7bd..81acc9862b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -77,6 +77,10 @@ class PushEngine : public BaseReadEngine int onTheFlyReadReqs; std::queue memRespQueue; + virtual void startup(); + + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + bool sendPushUpdate(PacketPtr pkt); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 517d10ef67..b874ec65ec 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -56,53 +56,6 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - //FIXME: The WLEngine no longer has a MemPort. Update this to - // work with the CoalesceEngine instead. 
- WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - coalesceEngine->recvFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, _requestorId); - - handleIncomingUpdate(first_update); -} - AddrRangeList WLEngine::RespPort::getAddrRanges() const { @@ -152,6 +105,7 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t* update_value = update->getPtr(); + // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -178,6 +132,7 @@ WLEngine::processNextReadEvent() // TODO: Add a stat to count the number of coalescions } + // TODO: Only schedule nextReadEvent only when it has to be scheduled if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); @@ -208,11 +163,12 @@ WLEngine::processNextReduceEvent() WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.temp_prop, + 
"%d, with new update: %d.\n", __func__, addr, wl.tempProp, onTheFlyUpdateMap[addr]); // TODO: Generalize this to reduce function rather than just min + wl.tempProp = std::min(update_value, wl.tempProp); stats.numReduce++; - wl.temp_prop = std::min(update_value, wl.temp_prop); + coalesceEngine->recvWLWrite(addr, wl); servicedAddresses.push_back(addr); it++; @@ -227,16 +183,15 @@ WLEngine::processNextReduceEvent() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - // TODO: Coalesce updates here too assert(updateQueue.size() <= updateQueueSize); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } updateQueue.push(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } return true; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 891916e7af..ef18956ec1 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -73,7 +73,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map onTheFlyUpdateMap; std::unordered_map addrWorkListMap; - virtual void startup(); void recvFunctional(PacketPtr pkt); From aa5a5e06804582845ae1c33732d759a1d51a3ece Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 18:03:35 -0700 Subject: [PATCH 076/287] Fixing base_edge_addr in config and debugs. 
--- configs/accl/sega.py | 6 +++--- src/accl/graph/base/SConscript | 1 - src/accl/graph/sega/push_engine.cc | 11 +++++++---- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8ea247106e..680157ba7e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,7 +4,7 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x100000, + self.push_engine = PushEngine(base_edge_addr=0x80000000, push_req_queue_size = 16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) @@ -60,9 +60,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="live-journal/graph_binaries/vertices", + vertex_binary="epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="live-journal/graph_binaries/edgelist_0") + edge_binary="epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8aefca2185..ea96f4323b 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,4 +32,3 @@ SimObject('BaseReduceEngine.py') Source('base_read_engine.cc') Source('base_reduce_engine.cc') -Source('util.cc') diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d09da113ee..c305a4bbb9 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -203,23 +203,26 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); + DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", + __func__, pkt->getAddr()); + Addr offset = reqOffsetMap[req]; int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); + uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); Edge* e = (Edge*) (curr_edge_data); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); int data_size = sizeof(uint32_t) / sizeof(uint8_t); uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); // TODO: Implement propagate function here *update_data = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); PacketPtr update = createUpdatePacket(e->neighbor, sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e->to_string(), *update_data); if (sendPushUpdate(update) && (i == num_edges - 1)) { memRespQueue.pop(); // TODO: Erase map entries here. 
From b8df760f0512d590c32826349e408cebe0e075bb Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 31 Mar 2022 19:00:29 -0700 Subject: [PATCH 077/287] Changing queue to deque --- src/accl/graph/base/base_read_engine.hh | 1 - src/accl/graph/sega/coalesce_engine.cc | 22 +++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 6 +++--- src/accl/graph/sega/push_engine.cc | 12 ++++++------ src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.cc | 6 +++--- src/accl/graph/sega/wl_engine.hh | 2 +- 7 files changed, 27 insertions(+), 28 deletions(-) diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 591b51aeb7..e21aaa01d2 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -29,7 +29,6 @@ #ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#include #include #include "base/addr_range.hh" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 67874cb9b9..9fed1e8230 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,8 +85,8 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push(addr); - worklistResponseQueue.push(cacheBlocks[block_index].items[wl_offset]); + addrResponseQueue.push_back(addr); + worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -143,7 +143,7 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); - outstandingMemReqQueue.push(pkt); + outstandingMemReqQueue.push_back(pkt); 
stats.numVertexBlockReads++; @@ -175,7 +175,7 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); - outstandingMemReqQueue.pop(); + outstandingMemReqQueue.pop_front(); } if ((!nextMemReqEvent.scheduled()) && @@ -192,8 +192,8 @@ CoalesceEngine::processNextRespondEvent() peerWLEngine->handleIncomingWL(addr_response, worklist_response); - addrResponseQueue.pop(); - worklistResponseQueue.pop(); + addrResponseQueue.pop_front(); + worklistResponseQueue.pop_front(); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -234,8 +234,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (alligned_miss_addr == addr) { int wl_offset = (miss_addr - alligned_miss_addr) / 16; - addrResponseQueue.push(miss_addr); - worklistResponseQueue.push( + addrResponseQueue.push_back(miss_addr); + worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; @@ -357,8 +357,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() // TODO: Make sure this trick works; Addr alligned_miss_addr = (miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); - outstandingMemReqQueue.push(write_pkt); - outstandingMemReqQueue.push(read_pkt); + outstandingMemReqQueue.push_back(write_pkt); + outstandingMemReqQueue.push_back(read_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -381,7 +381,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, evictQueue.size()); } else if ((!cacheBlocks[block_index].hasConflict) && (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { - outstandingMemReqQueue.push(write_pkt); + outstandingMemReqQueue.push_back(write_pkt); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); diff --git 
a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4bb21676d4..2cb9856f76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,10 +74,10 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; - std::queue outstandingMemReqQueue; + std::deque outstandingMemReqQueue; - std::queue addrResponseQueue; - std::queue worklistResponseQueue; + std::deque addrResponseQueue; + std::deque worklistResponseQueue; std::deque evictQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c305a4bbb9..450ba9ddc4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,7 +109,7 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push(wl); + pushReqQueue.push_back(wl); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -153,10 +153,10 @@ PushEngine::processNextAddrGenEvent() reqOffsetMap[pkt->req] = offset_queue[index]; reqNumEdgeMap[pkt->req] = num_edge_queue[index]; reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push(pkt); + pendingReadReqs.push_back(pkt); } - pushReqQueue.pop(); + pushReqQueue.pop_front(); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); @@ -175,7 +175,7 @@ PushEngine::processNextReadEvent() PacketPtr pkt = pendingReadReqs.front(); sendMemReq(pkt); onTheFlyReadReqs++; - pendingReadReqs.pop(); + pendingReadReqs.pop_front(); } if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { @@ -187,7 +187,7 @@ bool PushEngine::handleMemResp(PacketPtr pkt) { onTheFlyReadReqs--; - memRespQueue.push(pkt); + memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); @@ -224,7 +224,7 @@ PushEngine::processNextPushEvent() sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); if 
(sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); + memRespQueue.pop_front(); // TODO: Erase map entries here. } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 81acc9862b..1b1a812d16 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,18 +64,18 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::queue pushReqQueue; + std::deque pushReqQueue; std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; // TODO: Possibility of infinite queueing - std::queue pendingReadReqs; + std::deque pendingReadReqs; int memRespQueueSize; int onTheFlyReadReqs; - std::queue memRespQueue; + std::deque memRespQueue; virtual void startup(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b874ec65ec..73eacf945f 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -115,7 +115,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = *update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); } } else { @@ -127,7 +127,7 @@ WLEngine::processNextReadEvent() onTheFlyUpdateMap[update_addr] = std::min(*update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; - updateQueue.pop(); + updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); // TODO: Add a stat to count the number of coalescions } @@ -188,7 +188,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push(pkt); + updateQueue.push_back(pkt); assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { diff --git 
a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index ef18956ec1..c1ef028f77 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -67,7 +67,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::queue updateQueue; + std::deque updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From 2bfc6c7d5f6c2cb4911a2b72a228be95312a8dad Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 31 Mar 2022 20:25:31 -0700 Subject: [PATCH 078/287] Removing old files and renaming utils to data_structs. --- src/accl/graph/base/base_reduce_engine.hh | 5 - .../graph/base/{util.hh => data_structs.hh} | 0 src/accl/graph/base/old/BaseApplyEngine.py | 36 ---- src/accl/graph/base/old/BaseEngine.py | 39 ----- src/accl/graph/base/old/BasePushEngine.py | 36 ---- src/accl/graph/base/old/BaseWLEngine.py | 36 ---- src/accl/graph/base/old/base_apply_engine.cc | 137 --------------- src/accl/graph/base/old/base_apply_engine.hh | 72 -------- src/accl/graph/base/old/base_engine.cc | 100 ----------- src/accl/graph/base/old/base_engine.hh | 98 ----------- src/accl/graph/base/old/base_push_engine.cc | 145 ---------------- src/accl/graph/base/old/base_push_engine.hh | 82 --------- src/accl/graph/base/old/base_wl_engine.cc | 134 --------------- src/accl/graph/base/old/base_wl_engine.hh | 83 ---------- src/accl/graph/sega/coalesce_engine.hh | 2 +- src/accl/graph/sega/old/ApplyEngine.py | 38 ----- src/accl/graph/sega/old/LockDir.py | 46 ------ src/accl/graph/sega/old/PushEngine.py | 37 ----- src/accl/graph/sega/old/WLEngine.py | 40 ----- src/accl/graph/sega/old/apply_engine.cc | 58 ------- src/accl/graph/sega/old/apply_engine.hh | 67 -------- src/accl/graph/sega/old/lock_dir.cc | 63 ------- src/accl/graph/sega/old/lock_dir.hh | 57 ------- src/accl/graph/sega/old/push_engine.cc | 90 ---------- src/accl/graph/sega/old/push_engine.hh | 77 --------- 
src/accl/graph/sega/old/wl_engine.cc | 156 ------------------ src/accl/graph/sega/old/wl_engine.hh | 86 ---------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.hh | 3 +- 29 files changed, 4 insertions(+), 1821 deletions(-) rename src/accl/graph/base/{util.hh => data_structs.hh} (100%) delete mode 100644 src/accl/graph/base/old/BaseApplyEngine.py delete mode 100644 src/accl/graph/base/old/BaseEngine.py delete mode 100644 src/accl/graph/base/old/BasePushEngine.py delete mode 100644 src/accl/graph/base/old/BaseWLEngine.py delete mode 100644 src/accl/graph/base/old/base_apply_engine.cc delete mode 100644 src/accl/graph/base/old/base_apply_engine.hh delete mode 100644 src/accl/graph/base/old/base_engine.cc delete mode 100644 src/accl/graph/base/old/base_engine.hh delete mode 100644 src/accl/graph/base/old/base_push_engine.cc delete mode 100644 src/accl/graph/base/old/base_push_engine.hh delete mode 100644 src/accl/graph/base/old/base_wl_engine.cc delete mode 100644 src/accl/graph/base/old/base_wl_engine.hh delete mode 100644 src/accl/graph/sega/old/ApplyEngine.py delete mode 100644 src/accl/graph/sega/old/LockDir.py delete mode 100644 src/accl/graph/sega/old/PushEngine.py delete mode 100644 src/accl/graph/sega/old/WLEngine.py delete mode 100644 src/accl/graph/sega/old/apply_engine.cc delete mode 100644 src/accl/graph/sega/old/apply_engine.hh delete mode 100644 src/accl/graph/sega/old/lock_dir.cc delete mode 100644 src/accl/graph/sega/old/lock_dir.hh delete mode 100644 src/accl/graph/sega/old/push_engine.cc delete mode 100644 src/accl/graph/sega/old/push_engine.hh delete mode 100644 src/accl/graph/sega/old/wl_engine.cc delete mode 100644 src/accl/graph/sega/old/wl_engine.hh diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index f2245f571f..c8c9784ed1 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -29,8 +29,6 @@ #ifndef 
__ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ #define __ACCL_GRAPH_BASE_BASE_REDUCE_ENGINE_HH__ - -#include "accl/graph/base/util.hh" #include "params/BaseReduceEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -43,7 +41,6 @@ class BaseReduceEngine : public ClockedObject private: System* system; - protected: const RequestorID _requestorId; @@ -55,8 +52,6 @@ class BaseReduceEngine : public ClockedObject ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } - - virtual void handleIncomingWL(Addr addr, WorkListItem wl) = 0; }; } diff --git a/src/accl/graph/base/util.hh b/src/accl/graph/base/data_structs.hh similarity index 100% rename from src/accl/graph/base/util.hh rename to src/accl/graph/base/data_structs.hh diff --git a/src/accl/graph/base/old/BaseApplyEngine.py b/src/accl/graph/base/old/BaseApplyEngine.py deleted file mode 100644 index 9b240581ac..0000000000 --- a/src/accl/graph/base/old/BaseApplyEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseApplyEngine(BaseEngine): - abstract = True - type = 'BaseApplyEngine' - cxx_header = 'accl/graph/base/base_apply_engine.hh' - cxx_class = 'gem5::BaseApplyEngine' diff --git a/src/accl/graph/base/old/BaseEngine.py b/src/accl/graph/base/old/BaseEngine.py deleted file mode 100644 index 16c2f402e5..0000000000 --- a/src/accl/graph/base/old/BaseEngine.py +++ /dev/null @@ -1,39 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseEngine(ClockedObject): - abstract = True - type = 'BaseEngine' - cxx_header = "accl/graph/base/base_engine.hh" - cxx_class = 'gem5::BaseEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/old/BasePushEngine.py b/src/accl/graph/base/old/BasePushEngine.py deleted file mode 100644 index 2163864be3..0000000000 --- a/src/accl/graph/base/old/BasePushEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BasePushEngine(BaseEngine): - abstract = True - type = 'BasePushEngine' - cxx_header = "accl/graph/base/base_push_engine.hh" - cxx_class = 'gem5::BasePushEngine' diff --git a/src/accl/graph/base/old/BaseWLEngine.py b/src/accl/graph/base/old/BaseWLEngine.py deleted file mode 100644 index 7311c396b3..0000000000 --- a/src/accl/graph/base/old/BaseWLEngine.py +++ /dev/null @@ -1,36 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseEngine import BaseEngine - -class BaseWLEngine(BaseEngine): - abstract = True - type = 'BaseWLEngine' - cxx_header = "accl/graph/base/base_wl_engine.hh" - cxx_class = 'gem5::BaseWLEngine' diff --git a/src/accl/graph/base/old/base_apply_engine.cc b/src/accl/graph/base/old/base_apply_engine.cc deleted file mode 100644 index 39f5dafc67..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.cc +++ /dev/null @@ -1,137 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_apply_engine.hh" - -#include - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - - -namespace gem5 -{ - -BaseApplyEngine::BaseApplyEngine(const BaseApplyEngineParams ¶ms): - BaseEngine(params), - nextApplyCheckEvent([this]{ processNextApplyCheckEvent(); }, name()), - nextApplyEvent([this]{ processNextApplyEvent(); }, name()) -{} - -bool -BaseApplyEngine::recvWLNotif(Addr addr) -{ - // TODO: Investigate the situation where the queue is full. - applyReadQueue.push(addr); - if (!nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } - return true; -} - -void -BaseApplyEngine::processNextApplyCheckEvent() -{ - // TODO: We might want to change the way this function - // pops items off queue, maybe we should pop every n cycles - // or change the clock domain for this simobject. - Addr addr = applyReadQueue.front(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = (addr % 64); - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffset[memPkt->req] = req_offset; - if (!memPortBlocked()) { - sendMemReq(memPkt); - applyReadQueue.pop(); - } - } - if (!applyReadQueue.empty() && !nextApplyCheckEvent.scheduled()){ - schedule(nextApplyCheckEvent, nextCycle()); - } -} - -void -BaseApplyEngine::processNextApplyEvent() -{ - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - RequestPtr request = pkt->req; - Addr request_offset = requestOffset[request]; - - WorkListItem wl = memoryToWorkList(data + request_offset); - DPRINTF(MPU, "%s: Apply Engine is reading WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - // FIXME: Not so much of a fixme. However, why do we fwd a worklistitem - // to applyengine if temp_prop < prop. If temp_prop has not changed, why - // fwd it to applyengine? - if (wl.temp_prop < wl.prop) { - // TODO: instead of min add a Reduce function. 
- //update prop with temp_prop - wl.prop = wl.temp_prop; - //write back the new worklist item to memory - uint8_t* wList = workListToMemory(wl); - memcpy(data + request_offset, wList, sizeof(WorkListItem)); - //Create memory write requests. - PacketPtr writePkt = - getWritePacket(pkt->getAddr(), 64, data, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. " - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - - if (!memPortBlocked()) { - if (sendApplyNotif(wl.prop, wl.degree, wl.edgeIndex)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The Apply Engine is applying the new value into WorkList Item[%lu]: %s\n" - , __func__, pkt->getAddr() + request_offset, wl.to_string()); - } - } - } else { - memRespQueue.pop(); - } - if (!releaseAddress(pkt->getAddr())) { - panic("Could not release an address"); - } - if (!nextApplyEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextApplyEvent, nextCycle()); - } -} - -void -BaseApplyEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextApplyEvent.scheduled()) { - schedule(nextApplyEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_apply_engine.hh b/src/accl/graph/base/old/base_apply_engine.hh deleted file mode 100644 index f4df298079..0000000000 --- a/src/accl/graph/base/old/base_apply_engine.hh +++ /dev/null @@ -1,72 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BaseApplyEngine.hh" - -namespace gem5 -{ - -class BaseApplyEngine : public BaseEngine -{ - private: - std::queue applyReadQueue; - - std::unordered_map requestOffset; - - EventFunctionWrapper nextApplyCheckEvent; - void processNextApplyCheckEvent(); - - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - PARAMS(BaseApplyEngine); - - BaseApplyEngine(const BaseApplyEngineParams &apply); - - bool recvWLNotif(Addr addr); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_engine.cc b/src/accl/graph/base/old/base_engine.cc deleted file mode 100644 index ad87bb3662..0000000000 --- a/src/accl/graph/base/old/base_engine.cc +++ /dev/null @@ -1,100 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -BaseEngine::BaseEngine(const BaseEngineParams ¶ms) : - ClockedObject(params), - system(params.system), - memPort(name() + ".memPort", this), - requestorId(system->getRequestorId(this)) -{ - DPRINTF(MPU, "%s: My requestorId is %u,\n", __func__, requestorId); -} - -BaseEngine::~BaseEngine() -{} - -Port& -BaseEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); - -} - -void -BaseEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -BaseEngine::handleMemResp(PacketPtr pkt) -{ - if (pkt->isResponse() && pkt->isWrite()) { - return true; - } - memRespQueue.push(pkt); - scheduleMainEvent(); - return true; -} - -} diff --git a/src/accl/graph/base/old/base_engine.hh b/src/accl/graph/base/old/base_engine.hh deleted file mode 100644 index 53415ddc7c..0000000000 --- a/src/accl/graph/base/old/base_engine.hh +++ /dev/null @@ -1,98 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_ENGINE_HH__ - -#include -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - bool handleMemResp(PacketPtr resp); - - protected: - const RequestorID requestorId; - // TODO: Add this later, maybe? 
- // int memRespQueueSize; - std::queue memRespQueue; - - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } - - virtual void scheduleMainEvent() = 0; - - public: - PARAMS(BaseEngine); - - BaseEngine(const BaseEngineParams ¶ms); - ~BaseEngine(); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_push_engine.cc b/src/accl/graph/base/old/base_push_engine.cc deleted file mode 100644 index 4ebe40e486..0000000000 --- a/src/accl/graph/base/old/base_push_engine.cc +++ /dev/null @@ -1,145 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#include "accl/graph/base/base_push_engine.hh" - -#include "accl/graph/base/util.hh" -#include "debug/MPU.hh" - -namespace gem5 -{ - -BasePushEngine::BasePushEngine(const BasePushEngineParams ¶ms) : - BaseEngine(params), - nextReadEvent([this] { processNextReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) -{} - -bool -BasePushEngine::recvApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edge_index) -{ - notifQueue.emplace(prop, degree, edge_index); - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - DPRINTF(MPU, "%s: Reading %d edges.", __func__, degree); - return true; -} - -void -BasePushEngine::processNextReadEvent() -{ - ApplyNotif notif = notifQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < notif.degree; index++) { - // FIXME: For now the base edge address is 1048576 - Addr edge_addr = 1048576 + (notif.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 
0; index < addr_queue.size(); index++) { - if (!memPortBlocked()) { - PacketPtr pkt = getReadPacket(addr_queue[index], 64, requestorId); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = notif.prop; - sendMemReq(pkt); - notifQueue.pop(); - } - } - - if (!nextReadEvent.scheduled() && !notifQueue.empty()) { - schedule(nextReadEvent, nextCycle()); - } -} - -void -BasePushEngine::processNextPushEvent() -{ - PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); - - Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; - uint32_t value = reqValueMap[req]; - - int edge_in_bytes = sizeof(Edge) / sizeof(uint8_t); - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * edge_in_bytes); - Edge e = memoryToEdge(curr_edge_data); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - PacketPtr update = getUpdatePacket(e.neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data, - requestorId); - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop(); - DPRINTF(MPU, "%s: Reading %s, updating with %d\n" - , __func__, e.to_string(), *update_data); - // TODO: Erase map entries here. 
- } - } - - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { - schedule(nextPushEvent, nextCycle()); - } -} - -void -BasePushEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextPushEvent.scheduled()) { - schedule(nextPushEvent, nextCycle()); - } -} - -} diff --git a/src/accl/graph/base/old/base_push_engine.hh b/src/accl/graph/base/old/base_push_engine.hh deleted file mode 100644 index 01027d2791..0000000000 --- a/src/accl/graph/base/old/base_push_engine.hh +++ /dev/null @@ -1,82 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ - -#include - -#include "accl/graph/base/base_engine.hh" -#include "mem/request.hh" -#include "params/BasePushEngine.hh" - -namespace gem5 -{ - -class BasePushEngine : public BaseEngine -{ - private: - struct ApplyNotif { - uint32_t prop; - uint32_t degree; - uint32_t edgeIndex; - - ApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index): - prop(prop), degree(degree), edgeIndex(edge_index) - {} - }; - - std::queue notifQueue; - // int notifQueueSize; - - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; - - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - - EventFunctionWrapper nextPushEvent; - void processNextPushEvent(); - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BasePushEngine); - - BasePushEngine(const BasePushEngineParams ¶ms); - - bool recvApplyNotif(uint32_t prop, uint32_t degree, uint32_t edge_index); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/base/old/base_wl_engine.cc b/src/accl/graph/base/old/base_wl_engine.cc deleted file mode 100644 index fd45b85077..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.cc +++ /dev/null @@ -1,134 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of 
California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_wl_engine.hh" -#include "debug/MPU.hh" - -#include - -namespace gem5 -{ - -BaseWLEngine::BaseWLEngine(const BaseWLEngineParams ¶ms): - BaseEngine(params), - nextWLReadEvent([this]{ processNextWLReadEvent(); }, name()), - nextWLReduceEvent([this]{ processNextWLReduceEvent(); }, name()) -{} - -bool -BaseWLEngine::handleWLUpdate(PacketPtr pkt) -{ - updateQueue.push(pkt); - if(!nextWLReadEvent.scheduled()) { - schedule(nextWLReadEvent, nextCycle()); - } - return true; -} - -void BaseWLEngine::processNextWLReadEvent() -{ - PacketPtr pkt = updateQueue.front(); - uint32_t value = *(pkt->getPtr()); - - Addr addr = pkt->getAddr(); - Addr req_addr = (addr / 64) * 64; - Addr req_offset = addr % 64; - - if (acquireAddress(req_addr)) { - PacketPtr memPkt = getReadPacket(req_addr, 64, requestorId); - requestOffsetMap[memPkt->req] = req_offset; - requestValueMap[memPkt->req] = value; - - if (!memPortBlocked()) { - sendMemReq(memPkt); - updateQueue.pop(); - } - else{ - releaseAddress(req_addr); - } - } - if (!nextWLReadEvent.scheduled() && !updateQueue.empty()) { - schedule(nextWLReadEvent, nextCycle()); - } -} - -void -BaseWLEngine::processNextWLReduceEvent() -{ - PacketPtr resp = memRespQueue.front(); - uint8_t* respData = resp->getPtr(); - Addr request_offset = requestOffsetMap[resp->req]; - uint32_t value = requestValueMap[resp->req]; - WorkListItem wl = memoryToWorkList(respData + request_offset); - - DPRINTF(MPU, "%s: The WLE is reading WorkList item [%lu]: %s %d\n" - , __func__, resp->getAddr() + request_offset, wl.to_string(), value); - if (value < wl.temp_prop){ - //update prop with temp_prop - wl.temp_prop = value; - - uint8_t* wlData = workListToMemory(wl); - memcpy(respData + request_offset, wlData, sizeof(WorkListItem)); - PacketPtr writePkt = - getWritePacket(resp->getAddr(), 64, respData, requestorId); - - DPRINTF(MPU, "%s: Sending a pkt with this info. 
" - "pkt->addr: %lu, pkt->size: %lu\npkt->data: %s\n", - __func__, writePkt->getAddr(), - writePkt->getSize(), writePkt->printData()); - if (!memPortBlocked()) { - if (sendWLNotif(resp->getAddr() + request_offset)) { - sendMemReq(writePkt); - memRespQueue.pop(); - DPRINTF(MPU, "%s: The WLE is changing to: %s\n" - , __func__, wl.to_string()); - // TODO: Erase map entries, delete wlData; - } - } - } - else { - memRespQueue.pop(); - } - if (!releaseAddress(resp->getAddr())) { - panic("Could not release an address"); - } - if (!nextWLReduceEvent.scheduled() && !memRespQueue.empty()){ - schedule(nextWLReduceEvent, nextCycle()); - } -} - -void -BaseWLEngine::scheduleMainEvent() -{ - if (!memRespQueue.empty() && !nextWLReduceEvent.scheduled()) { - schedule(nextWLReduceEvent, nextCycle()); - } -} - - -} diff --git a/src/accl/graph/base/old/base_wl_engine.hh b/src/accl/graph/base/old/base_wl_engine.hh deleted file mode 100644 index 15371f965b..0000000000 --- a/src/accl/graph/base/old/base_wl_engine.hh +++ /dev/null @@ -1,83 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_engine.hh" -#include "accl/graph/base/util.hh" -#include "params/BaseWLEngine.hh" - -namespace gem5 -{ - -class BaseWLEngine : public BaseEngine -{ - private: - std::queue updateQueue; - std::queue responseQueue; - - std::unordered_map requestOffsetMap; - std::unordered_map requestValueMap; - - //Events - EventFunctionWrapper nextWLReadEvent; - void processNextWLReadEvent(); - /* Syncronously checked - If there are any active vertecies: - create memory read packets + MPU::MPU::MemPortsendTimingReq - */ - - EventFunctionWrapper nextWLReduceEvent; - void processNextWLReduceEvent(); - /* Activated by MPU::MPUMemPort::recvTimingResp and handleMemResp - Perform apply and send the write request and read edgeList - read + write - Write edgelist loc in buffer - */ - protected: - virtual bool sendWLNotif(Addr addr) = 0; - virtual bool acquireAddress(Addr addr) = 0; - virtual bool releaseAddress(Addr addr) = 0; - virtual void scheduleMainEvent() override; - - public: - - PARAMS(BaseWLEngine); - - BaseWLEngine(const BaseWLEngineParams 
¶ms); - - bool handleWLUpdate(PacketPtr pkt); -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2cb9856f76..ff30efde4c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" diff --git a/src/accl/graph/sega/old/ApplyEngine.py b/src/accl/graph/sega/old/ApplyEngine.py deleted file mode 100644 index 7a446bb620..0000000000 --- a/src/accl/graph/sega/old/ApplyEngine.py +++ /dev/null @@ -1,38 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseApplyEngine import BaseApplyEngine - -class ApplyEngine(BaseApplyEngine): - type = 'ApplyEngine' - cxx_header = "accl/graph/sega/apply_engine.hh" - cxx_class = 'gem5::ApplyEngine' - - push_engine = Param.PushEngine(Parent.any, "MPU object that owns this ApplyEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/LockDir.py b/src/accl/graph/sega/old/LockDir.py deleted file mode 100644 index d21963dc3a..0000000000 --- a/src/accl/graph/sega/old/LockDir.py +++ /dev/null @@ -1,46 +0,0 @@ -# Copyright (c) 2012-2014, 2017-2018 ARM Limited -# All rights reserved. -# -# The license below extends only to copyright in the software and shall -# not be construed as granting a license to any other intellectual -# property including but not limited to intellectual property relating -# to a hardware implementation of the functionality of the software -# licensed hereunder. You may use the software subject to the license -# terms below provided that you ensure that this notice is replicated -# unmodified and in its entirety in all distributions of the software, -# modified or unmodified, in source code or in binary form. -# -# Copyright (c) 2007 The Regents of The University of Michigan -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.SimObject import SimObject - -class LockDirectory(SimObject): - type = 'LockDirectory' - cxx_header = 'accl/graph/sega/lock_dir.hh' - cxx_class = 'gem5::LockDirectory' diff --git a/src/accl/graph/sega/old/PushEngine.py b/src/accl/graph/sega/old/PushEngine.py deleted file mode 100644 index a743b57262..0000000000 --- a/src/accl/graph/sega/old/PushEngine.py +++ /dev/null @@ -1,37 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BasePushEngine import BasePushEngine - -class PushEngine(BasePushEngine): - type = 'PushEngine' - cxx_header = "accl/graph/sega/push_engine.hh" - cxx_class = 'gem5::PushEngine' - - req_port = RequestPort("Port to send updates to the outside") diff --git a/src/accl/graph/sega/old/WLEngine.py b/src/accl/graph/sega/old/WLEngine.py deleted file mode 100644 index b6e697266e..0000000000 --- a/src/accl/graph/sega/old/WLEngine.py +++ /dev/null @@ -1,40 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseWLEngine import BaseWLEngine - -class WLEngine(BaseWLEngine): - type = 'WLEngine' - cxx_header = "accl/graph/sega/wl_engine.hh" - cxx_class = 'gem5::WLEngine' - - resp_port = ResponsePort("Port to Receive updates from outside") - apply_engine = Param.ApplyEngine(Parent.any, - "MPU object that owns this WLEngine") - lock_dir = Param.LockDirectory(NULL, "The lock directory to acquire locks from") diff --git a/src/accl/graph/sega/old/apply_engine.cc b/src/accl/graph/sega/old/apply_engine.cc deleted file mode 100644 index 544bb082ad..0000000000 --- a/src/accl/graph/sega/old/apply_engine.cc +++ /dev/null @@ -1,58 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/apply_engine.hh" - -namespace gem5{ - -ApplyEngine::ApplyEngine(const ApplyEngineParams ¶ms) : - BaseApplyEngine(params), - pushEngine(params.push_engine), - lockDir(params.lock_dir) -{} - -bool -ApplyEngine::sendApplyNotif(uint32_t prop, uint32_t degree, uint32_t edgeIndex) -{ - return pushEngine->recvApplyNotif(prop, degree, edgeIndex); - -} - -bool -ApplyEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -ApplyEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/apply_engine.hh b/src/accl/graph/sega/old/apply_engine.hh deleted file mode 100644 index c88330487a..0000000000 --- a/src/accl/graph/sega/old/apply_engine.hh +++ /dev/null @@ -1,67 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "accl/graph/sega/push_engine.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/ApplyEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/port.hh" - -namespace gem5 -{ - - -class ApplyEngine : public BaseApplyEngine -{ - private: - PushEngine* pushEngine; - LockDirectory* lockDir; - - protected: - virtual bool sendApplyNotif(uint32_t prop, - uint32_t degree, uint32_t edgeIndex) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(ApplyEngine); - ApplyEngine(const ApplyEngineParams ¶ms); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_APPLY_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/lock_dir.cc b/src/accl/graph/sega/old/lock_dir.cc deleted file mode 100644 index 6a4496175d..0000000000 --- a/src/accl/graph/sega/old/lock_dir.cc +++ /dev/null @@ -1,63 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/lock_dir.hh" - -namespace gem5 -{ - -LockDirectory::LockDirectory(const LockDirectoryParams ¶ms) : - SimObject(params) -{} - -bool -LockDirectory::acquire(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - lockOwnerMap[addr] = requestorId; - return true; - } else { - return false; - } -} - -bool -LockDirectory::release(Addr addr, RequestorID requestorId) -{ - if (lockOwnerMap.find(addr) == lockOwnerMap.end()) { - panic("Should not relase an address before acquiring"); - } else if (lockOwnerMap[addr] != requestorId) { - panic("Should not release and address you don't own"); - } else { - lockOwnerMap.erase(addr); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/lock_dir.hh b/src/accl/graph/sega/old/lock_dir.hh deleted file mode 100644 index 012334ce43..0000000000 --- a/src/accl/graph/sega/old/lock_dir.hh +++ /dev/null @@ -1,57 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ -#define __ACCL_GRAPH_SEGA_LOCK_DIR_HH__ - -#include - -#include "mem/packet.hh" -#include "params/LockDirectory.hh" -#include "sim/sim_object.hh" - -namespace gem5 -{ - -class LockDirectory: public SimObject -{ - private: - std::unordered_map lockOwnerMap; - // std::unordered_map lockDegreeMap; - - public: - PARAMS(LockDirectory); - LockDirectory(const LockDirectoryParams ¶ms); - - bool acquire(Addr addr, RequestorID requestorId); - bool release(Addr addr, RequestorID requestorId); -}; - -} - -#endif diff --git a/src/accl/graph/sega/old/push_engine.cc b/src/accl/graph/sega/old/push_engine.cc deleted file mode 100644 index c7b229ad33..0000000000 --- a/src/accl/graph/sega/old/push_engine.cc +++ /dev/null @@ -1,90 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/push_engine.hh" - -namespace gem5 -{ - -PushEngine::PushEngine(const PushEngineParams ¶ms) : - BasePushEngine(params), - reqPort(name() + "reqPort", this) -{} - -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BasePushEngine::getPort(if_name, idx); - } -} - -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } -} - -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - -} diff --git a/src/accl/graph/sega/old/push_engine.hh b/src/accl/graph/sega/old/push_engine.hh deleted file mode 100644 index 604df4750d..0000000000 --- a/src/accl/graph/sega/old/push_engine.hh +++ /dev/null @@ -1,77 +0,0 @@ -/* - * Copyright (c) 2021 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ - -#include "accl/graph/base/base_push_engine.hh" -#include "params/PushEngine.hh" - -namespace gem5 -{ - -class MPU; - -class PushEngine : public BasePushEngine -{ - private: - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - - protected: - virtual bool sendPushUpdate(PacketPtr pkt) override; - - public: - PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} - -#endif // __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ diff --git a/src/accl/graph/sega/old/wl_engine.cc b/src/accl/graph/sega/old/wl_engine.cc deleted file mode 100644 index 03f74f1019..0000000000 --- a/src/accl/graph/sega/old/wl_engine.cc +++ /dev/null @@ -1,156 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" -namespace gem5 -{ - -WLEngine::WLEngine(const WLEngineParams ¶ms): - BaseWLEngine(params), - respPort(name() + ".respPort", this), - applyEngine(params.apply_engine), - lockDir(params.lock_dir) -{} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseWLEngine::getPort(if_name, idx); - } -} - -void -WLEngine::startup() -{ - //FIXME: This is the current version of our initializer. - // This should be updated in the future. - WorkListItem vertices [5] = { - {10000, 10000, 3, 0}, // Addr: 0 - {10000, 10000, 1, 3}, // Addr: 16 - {10000, 10000, 1, 4}, // Addr: 32 - {10000, 10000, 1, 5}, // Addr: 48 - {10000, 10000, 0, 6} // Addr: 64 - }; - Edge edges [7] = { - {0, 16}, // Addr: 1048576 - {0, 32}, // Addr: 1048592 - {0, 48}, // Addr: 1048608 - {0, 32}, // Addr: 1048624 - {0, 64}, // Addr: 1048640 - {0, 32} - }; - - for (int i = 0; i < 5; i++) { - uint8_t* data = workListToMemory(vertices[i]); - PacketPtr pkt = getWritePacket(0 + i * sizeof(WorkListItem), - 16, data, 0); - sendMemFunctional(pkt); - } - - for (int i = 0; i < 7; i++) { - uint8_t* data = edgeToMemory(edges[i]); - PacketPtr pkt = getWritePacket(1048576 + i * sizeof(Edge), - 16, data, 0); - sendMemFunctional(pkt); - } - - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - PacketPtr first_update = getUpdatePacket( - 0, 4, first_update_data, requestorId); - - handleWLUpdate(first_update); -} - -bool -WLEngine::sendWLNotif(Addr addr){ - return applyEngine->recvWLNotif(addr); -} - -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) -{ - return owner->handleWLUpdate(pkt); -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void 
-WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - // FIXME: This needs to be fixed - // if (pkt->cmd == MemCmd::UpdateWL) { - // panic("Functional requests should not be made to WL."); - // //TODO: Might be a good idea to implement later. - // // wlEngine->recvFunctional(pkt); - // } else { - sendMemFunctional(pkt); - // } -} - -bool -WLEngine::acquireAddress(Addr addr) -{ - return lockDir->acquire(addr, requestorId); -} - -bool -WLEngine::releaseAddress(Addr addr) -{ - return lockDir->release(addr, requestorId); -} - -} diff --git a/src/accl/graph/sega/old/wl_engine.hh b/src/accl/graph/sega/old/wl_engine.hh deleted file mode 100644 index 4e8a25795a..0000000000 --- a/src/accl/graph/sega/old/wl_engine.hh +++ /dev/null @@ -1,86 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. - * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. 
- * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - */ - -#ifndef __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ - -#include -#include - -#include "accl/graph/base/base_wl_engine.hh" -#include "accl/graph/sega/apply_engine.hh" -#include "accl/graph/sega/lock_dir.hh" -#include "params/WLEngine.hh" - -namespace gem5 -{ - -class ApplyEngine; - -class WLEngine : public BaseWLEngine -{ - private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) - {} - virtual AddrRangeList getAddrRanges() const; - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - RespPort respPort; - ApplyEngine* applyEngine; - LockDirectory* lockDir; - - virtual void startup(); - void recvFunctional(PacketPtr pkt); - - protected: - virtual bool sendWLNotif(Addr addr) override; - virtual bool acquireAddress(Addr addr) override; - virtual bool releaseAddress(Addr addr) override; - - public: - PARAMS(WLEngine); - WLEngine(const WLEngineParams ¶ms); - Port& getPort(const 
std::string &if_name, - PortID idx=InvalidPortID) override; -}; - -} -#endif // __ACCL_GRAPH_SEGA_WL_ENGINE_HH__ diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1b1a812d16..4c9822345f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -30,7 +30,7 @@ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #include "accl/graph/base/base_read_engine.hh" -#include "accl/graph/base/util.hh" +#include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index c1ef028f77..a8dff32d44 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -108,7 +109,7 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); - virtual void handleIncomingWL(Addr addr, WorkListItem wl); + void handleIncomingWL(Addr addr, WorkListItem wl); }; } From 2d18a7b77fb6e0bdbb5d9fae5ef92ee9a3181311 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 1 Apr 2022 11:07:05 -0700 Subject: [PATCH 079/287] Fixing bugs. 
--- configs/accl/sega.py | 9 +-- src/accl/graph/sega/push_engine.cc | 110 +++++++++++++++-------------- src/accl/graph/sega/push_engine.hh | 6 +- src/accl/graph/sega/wl_engine.cc | 23 +++--- 4 files changed, 78 insertions(+), 70 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 680157ba7e..a0c7766fe0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,11 +5,12 @@ class MPU(SubSystem): def __init__(self): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=0x80000000, - push_req_queue_size = 16) + push_req_queue_size=16, + mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size = 16, + update_queue_size=16, on_the_fly_update_map_size=8) self.interconnect = SystemXBar() @@ -60,9 +61,9 @@ def __init__(self): self.mpu = MPU() self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="epinions/graph_binaries/vertices", + vertex_binary="facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="epinions/graph_binaries/edgelist_0") + edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setMemPort(self.mem_ctrl.getPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 450ba9ddc4..0b4c981d48 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -65,6 +65,7 @@ PushEngine::startup() *tempPtr = 0; PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -109,7 +110,11 @@ PushEngine::recvWLItem(WorkListItem wl) return false; } - pushReqQueue.push_back(wl); + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * 
sizeof(Edge)); + uint32_t update_value = wl.prop; + pushReqQueue.push_back( + std::make_pair(std::make_pair(start_addr, end_addr), update_value)); if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { @@ -121,43 +126,36 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - WorkListItem wl = pushReqQueue.front(); - - std::vector addr_queue; - std::vector offset_queue; - std::vector num_edge_queue; - - for (uint32_t index = 0; index < wl.degree; index++) { - Addr edge_addr = baseEdgeAddr + (wl.edgeIndex + index) * sizeof(Edge); - Addr req_addr = (edge_addr / 64) * 64; - Addr req_offset = edge_addr % 64; - if (addr_queue.size()) { - if (addr_queue.back() == req_addr) { - num_edge_queue.back()++; - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - } - else { - addr_queue.push_back(req_addr); - offset_queue.push_back(req_offset); - num_edge_queue.push_back(1); - } - }; - - for (int index = 0; index < addr_queue.size(); index++) { - PacketPtr pkt = createReadPacket(addr_queue[index], 64); - reqOffsetMap[pkt->req] = offset_queue[index]; - reqNumEdgeMap[pkt->req] = num_edge_queue[index]; - reqValueMap[pkt->req] = wl.prop; - pendingReadReqs.push_back(pkt); + Addr start_addr, end_addr; + uint32_t update_value; + + std::pair, uint32_t> front = pushReqQueue.front(); + std::tie(start_addr, end_addr) = front.first; + update_value = front.second; + + Addr req_addr = (start_addr / 64) * 64; + Addr req_offset = start_addr % 64; + int num_edges = 0; + + if (end_addr > req_addr + 64) { + num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); + } else { + num_edges = (end_addr - start_addr) / sizeof(Edge); } + PacketPtr pkt = createReadPacket(req_addr, 64); + reqOffsetMap[pkt->req] = req_offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = update_value; + pendingReadReqs.push_back(pkt); pushReqQueue.pop_front(); + if (req_addr + 64 < end_addr) { + 
pushReqQueue.push_front( + std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) + ); + } + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { schedule(nextAddrGenEvent, nextCycle()); } @@ -207,26 +205,30 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr()); Addr offset = reqOffsetMap[req]; - int num_edges = reqNumEdgeMap[req]; uint32_t value = reqValueMap[req]; - for (int i = 0; i < num_edges; i++) { - uint8_t *curr_edge_data = data + offset + (i * sizeof(Edge)); - Edge* e = (Edge*) (curr_edge_data); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); - // TODO: Implement propagate function here - *update_data = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t) / sizeof(uint8_t), (uint8_t*) update_data); - - if (sendPushUpdate(update) && (i == num_edges - 1)) { - memRespQueue.pop_front(); - // TODO: Erase map entries here. 
- } + Edge* e = (Edge*) (data + offset); + DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + int data_size = sizeof(uint32_t) / sizeof(uint8_t); + uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here + *update_data = value + 1; + // uint32_t update_value = value + 1; + DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", + __func__, e->neighbor, *update_data); + PacketPtr update = createUpdatePacket(e->neighbor, + sizeof(uint32_t), (uint8_t*) update_data); + + if (sendPushUpdate(update)) { + reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); + reqNumEdgeMap[req]--; + } + + if (reqNumEdgeMap[req] == 0) { + memRespQueue.pop_front(); + reqOffsetMap.erase(req); + reqNumEdgeMap.erase(req); + reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ -235,7 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -247,6 +250,7 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t *data) pkt->allocate(); pkt->setData(data); + // pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4c9822345f..faee5128b7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -64,8 +64,9 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque pushReqQueue; + std::deque, uint32_t>> pushReqQueue; + // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; @@ -79,7 
+80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t *data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 73eacf945f..117abb61e8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -106,17 +106,18 @@ WLEngine::processNextReadEvent() uint32_t* update_value = update->getPtr(); // FIXME: else logic is wrong - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end()) && - (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize)) { - if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if (coalesceEngine->recvReadAddr(update_addr)) { + DPRINTF(MPU, "%s: Received an update and it's not been pulled in. 
" + "update_addr: %lu, update_value: %u.\n", + __func__, update_addr, *update_value); + onTheFlyUpdateMap[update_addr] = *update_value; + DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); + updateQueue.pop_front(); + DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + } } } else { // TODO: Generalize this to reduce function rather than just min From cf001ea9840f11ad2d78fa73c83cb5100039819a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 15:39:56 -0700 Subject: [PATCH 080/287] Updating createUpdatePacket. --- src/accl/graph/TODO.md | 8 ++++++++ src/accl/graph/sega/coalesce_engine.cc | 17 ++++------------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 26 +++++++++++++------------- src/accl/graph/sega/push_engine.hh | 4 ++-- src/accl/graph/sega/wl_engine.cc | 14 ++++++++------ 6 files changed, 36 insertions(+), 34 deletions(-) create mode 100644 src/accl/graph/TODO.md diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md new file mode 100644 index 0000000000..d5effbeb96 --- /dev/null +++ b/src/accl/graph/TODO.md @@ -0,0 +1,8 @@ +# TODO Items + +* use setLE/setBE inside createUpdatePacket and createWritePacket +* parameterize cache size, associativity, maybe latencies, +and memory atom size in the coalesce engine +* look at all the simobjects and come up with a general architecture. Make +sure all the simobjects follow that architecture. +* implement all the communications between simobjects as req/retry. 
diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 9fed1e8230..8d97fffd20 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -300,19 +301,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) // TODO: Make this more general and programmable. // && (cacheBlocks[block_index].hasConflict) - bool found = false; if ((cacheBlocks[block_index].takenMask == 0)) { - for (auto index : evictQueue) { - if (block_index == index) { - found = true; - break; - } - } - if (!found) { - evictQueue.push_back(block_index); - } - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); + evictQueue.push_back(block_index); } if ((!nextApplyAndCommitEvent.scheduled()) && @@ -328,6 +318,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size + uint8_t* wl_data; uint8_t data[64]; for (int i = 0; i < 4; i++) { @@ -341,7 +332,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. 
" "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, i, cacheBlocks[block_index].items[i].to_string()); - uint8_t* wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); + wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ff30efde4c..5c4e752cbf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,6 +84,7 @@ class CoalesceEngine : public BaseReadEngine virtual void startup(); PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0b4c981d48..870b32f2fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -64,8 +65,8 @@ PushEngine::startup() uint32_t* tempPtr = (uint32_t*) first_update_data; *tempPtr = 0; - PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - // PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); + PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); sendPushUpdate(first_update); } @@ -193,7 +194,7 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// FIXME: FIX THIS FUNCTION FOR TIMING AND FUNCTIONAL ACCURACY. +// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPushEvent() { @@ -209,15 +210,14 @@ PushEngine::processNextPushEvent() Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); - int data_size = sizeof(uint32_t) / sizeof(uint8_t); - uint32_t* update_data = (uint32_t*) (new uint8_t [data_size]); + // TODO: Implement propagate function here - *update_data = value + 1; - // uint32_t update_value = value + 1; + uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, *update_data); + __func__, e->neighbor, update_value); + PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), (uint8_t*) update_data); + sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); @@ -237,8 +237,8 @@ PushEngine::processNextPushEvent() } PacketPtr -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) +PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -249,8 +249,8 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) PacketPtr pkt = new Packet(req, MemCmd::ReadReq); pkt->allocate(); - pkt->setData(data); - // pkt->setLE(value); + // pkt->setData(data); + pkt->setLE(value); return pkt; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index faee5128b7..a539079ede 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -80,8 +80,8 @@ class PushEngine : public BaseReadEngine virtual void startup(); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr 
createUpdatePacket(Addr addr, unsigned int size, uint32_t value); + // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); + PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); bool sendPushUpdate(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 117abb61e8..3a6911c1bf 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -27,7 +27,9 @@ */ #include "accl/graph/sega/wl_engine.hh" + #include "debug/MPU.hh" +#include "mem/packet_access.hh" namespace gem5 { @@ -103,7 +105,7 @@ WLEngine::processNextReadEvent() { PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); - uint32_t* update_value = update->getPtr(); + uint32_t update_value = update->getLE(); // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { @@ -111,8 +113,8 @@ WLEngine::processNextReadEvent() if (coalesceEngine->recvReadAddr(update_addr)) { DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, *update_value); - onTheFlyUpdateMap[update_addr] = *update_value; + __func__, update_addr, update_value); + onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); @@ -123,10 +125,10 @@ WLEngine::processNextReadEvent() // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." 
"update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, *update_value, + __func__, update_addr, update_value, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = - std::min(*update_value, onTheFlyUpdateMap[update_addr]); + std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); @@ -154,7 +156,6 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = addrWorkListMap.begin(); @@ -190,6 +191,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); + assert(!updateQueue.empty()); DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { From c405e30aacbebd410d24fc83924f9769ea8e74f9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 3 Apr 2022 17:26:15 -0700 Subject: [PATCH 081/287] Adding retry to wle respPort and debug. --- src/accl/graph/sega/push_engine.cc | 13 +++++++++---- src/accl/graph/sega/wl_engine.cc | 31 +++++++++++++++++++++++++----- src/accl/graph/sega/wl_engine.hh | 3 +++ 3 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 870b32f2fb..70d6242f5b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -95,10 +95,12 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + _blocked = false; sendPacket(blockedPacket); - if (!blocked()) { + if (!_blocked) { blockedPacket = nullptr; } } @@ -202,12 +204,13 @@ PushEngine::processNextPushEvent() RequestPtr req = pkt->req; uint8_t *data = pkt->getPtr(); - DPRINTF(MPU, "%s: Looking at the front of the queue. 
pkt->Addr: %lu.\n", - __func__, pkt->getAddr()); - Addr offset = reqOffsetMap[req]; uint32_t value = reqValueMap[req]; + DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + "offset: %lu\n", + __func__, pkt->getAddr(), offset); + Edge* e = (Edge*) (data + offset); DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); @@ -220,6 +223,8 @@ PushEngine::processNextPushEvent() sizeof(uint32_t), update_value); if (sendPushUpdate(update)) { + DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", + __func__, e->neighbor, update_value); reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); reqNumEdgeMap[req]--; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 3a6911c1bf..27c7ad4fea 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -64,10 +64,25 @@ WLEngine::RespPort::getAddrRanges() const return owner->getAddrRanges(); } +void +WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + sendRetryReq(); + needSendRetryReq = false; + } +} + bool WLEngine::RespPort::recvTimingReq(PacketPtr pkt) { - return owner->handleIncomingUpdate(pkt); + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; } Tick @@ -107,7 +122,6 @@ WLEngine::processNextReadEvent() Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); - // FIXME: else logic is wrong if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { @@ -118,7 +132,11 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, 
updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } + } } } else { @@ -131,8 +149,10 @@ WLEngine::processNextReadEvent() std::min(update_value, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); - // TODO: Add a stat to count the number of coalescions + DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + if (updateQueue.size() == updateQueueSize - 1) { + respPort.checkRetryReq(); + } } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -180,6 +200,7 @@ WLEngine::processNextReduceEvent() for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); } + DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index a8dff32d44..476c9be932 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,6 +48,7 @@ class WLEngine : public BaseReduceEngine { private: WLEngine* owner; + bool needSendRetryReq; public: RespPort(const std::string& name, WLEngine* owner): @@ -55,6 +56,8 @@ class WLEngine : public BaseReduceEngine {} virtual AddrRangeList getAddrRanges() const; + void checkRetryReq(); + protected: virtual bool recvTimingReq(PacketPtr pkt); virtual Tick recvAtomic(PacketPtr pkt); From f43564614cbf10d78bb23122e2242e657776ebef Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 5 Apr 2022 09:20:52 -0700 Subject: [PATCH 082/287] Debugging coalesce engine deadlock. 
--- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 71 ++++--- 5 files changed, 254 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index b51a9f0781..dacb74e38c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - std::string to_string() - { - return csprintf("Edge{weight: %lu, neighbor: %lu}", weight, neighbor); - } + // std::string to_string() + // { + // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + // } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8d97fffd20..d7fa806fff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -41,6 +41,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), @@ -77,17 +79,21 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && 
(cacheBlocks[block_index].valid)) { // Hit - DPRINTF(MPU, "%s: Read request with addr: %lu hit in the cache.\n" - , __func__, addr); // TODO: Make addrQueue and wlQueue into one std::pair addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back(cacheBlocks[block_index].items[wl_offset]); + worklistResponseQueue.push_back( + cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, + worklistResponseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -101,50 +107,72 @@ CoalesceEngine::recvReadAddr(Addr addr) return true; } else { // miss + DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries + DPRINTF(MPU, "%s: Out of MSHR entries. " + "Rejecting request.\n", __func__); return false; } else { + DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } - // MSHR available but conflict - DPRINTF(MPU, "%s: Read request with addr: %lu missed with " - "conflict. 
Making a request for " - "aligned_addr: %lu.\n", - __func__, addr, aligned_addr); cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d]", __func__, addr, block_index); return true; } else { // TODO: Set valid to false every deallocation and - // assert valid == false here. + assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict assert( outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + "allocate a cache line for it.\n", + __func__, addr); if (outstandingMemReqQueue.size() == outstandingMemReqQueueSize) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " + "(outstandingMemReqQueue.size: %u). " + "Rejecting request.\n", __func__, + outstandingMemReqQueue.size()); return false; } - DPRINTF(MPU, "%s: Read request with addr: " - "%lu missed with no conflict. " - "Making a request for aligned_addr: %lu.\n" - , __func__, addr, aligned_addr); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: Allocated cache line[%d] for " + "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); // TODO: Parameterize 64 to memory atom size PacketPtr pkt = createReadPacket(aligned_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, addr, aligned_addr); outstandingMemReqQueue.push_back(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %d", __func__, + outstandingMemReqQueue.size()); stats.numVertexBlockReads++; @@ -156,14 +184,24 @@ CoalesceEngine::recvReadAddr(Addr addr) } } } else { + DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { + DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + "Rejecting request.\n", + __func__, block_index); return false; } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { + DPRINTF(MPU, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } MSHRMap[block_index].push_back(addr); + DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + "line[%d].\n", __func__, addr, block_index); return true; } } @@ -176,9 +214,24 @@ CoalesceEngine::processNextMemReqEvent() if (!memPortBlocked()) { sendMemReq(pkt); + DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", + __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); outstandingMemReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); } + if ((alarmRequested) && + (outstandingMemReqQueue.size() < + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + schedule(nextApplyAndCommitEvent, nextCycle()); + DPRINTF(MPU, "%s: There is an alarm request for " + "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " + "nextApplyAndCommitEvent.\n", __func__); + } if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { schedule(nextMemReqEvent, nextCycle()); @@ -192,9 +245,14 @@ CoalesceEngine::processNextRespondEvent() WorkListItem worklist_response = worklistResponseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); addrResponseQueue.pop_front(); worklistResponseQueue.pop_front(); + DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " + "worklistResponseQueue.size = %d.\n", __func__, + worklistResponseQueue.size()); if ((!nextRespondEvent.scheduled()) && (!worklistResponseQueue.empty()) && @@ -208,15 +266,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + "the packet.\n", __func__, pkt->getAddr()); return true; } Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - int block_index = addr % 256; // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. + // TODO: After parameterizing the cache size + // this 256 number should change to the cache + // size parameter. 
+ int block_index = (addr / 64) % 256; + DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR @@ -224,6 +287,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) for (int i = 0; i < 4; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); + DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -231,29 +296,42 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr alligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = (miss_addr / 64) * 64; - if (alligned_miss_addr == addr) { - int wl_offset = (miss_addr - alligned_miss_addr) / 16; + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / 16; + DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + "be serviced with the received packet.\n", + __func__, miss_addr, block_index); addrResponseQueue.push_back(miss_addr); worklistResponseQueue.push_back( cacheBlocks[block_index].items[wl_offset]); + DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + "worklistResponseQueue. 
worklistResponseQueue.size = %u.\n" + , __func__, block_index, wl_offset, + worklistResponseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); + DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this for (int i = 0; i < servicedIndices.size(); i++) { + Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; + DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + __func__, print_addr); } if (MSHRMap[block_index].empty()) { MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { + // TODO: I think this is unnecessary. cacheBlocks[block_index].hasConflict = true; } @@ -286,27 +364,33 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { Addr aligned_addr = (addr / 64) * 64; - int block_index = aligned_addr % 256; + int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; - DPRINTF(MPU, "%s: Recieved a WorkList write. addr: %lu, wl: %s.\n", - __func__, addr, wl.to_string()); - DPRINTF(MPU, "%s: aligned_addr: %lu, block_index: %d, wl_offset: %d, " - "takenMask: %u.\n", __func__, aligned_addr, - block_index, wl_offset, cacheBlocks[block_index].takenMask); + + DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
// && (cacheBlocks[block_index].hasConflict) if ((cacheBlocks[block_index].takenMask == 0)) { + DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." + " It does not have any taken items anymore.\n", + __func__, block_index); evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { + (!evictQueue.empty())&& + ((!alarmRequested) && (spaceRequested == 0))) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -315,90 +399,163 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { + assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; // TODO: parameterize 64 to memory atom size uint8_t* wl_data; uint8_t data[64]; + DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", + __func__, block_index); + DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " + "then commited.\n", __func__, block_index); + + if ((cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + + } else if ((!cacheBlocks[block_index].hasConflict) && + (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else { + alarmRequested = true; + spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " + "an alarm for nextApplyAndCommitEvent when space = %d.\n", + __func__, spaceRequested); + return; + } + for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - DPRINTF(MPU, "%s: Writing WorkListItem[%lu[%d]] to memory. " - "WLItem: %s.\n", __func__, cacheBlocks[block_index].addr, - i, cacheBlocks[block_index].items[i].to_string()); wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); std::memcpy(data + (i * sizeof(WorkListItem)), wl_data, sizeof(WorkListItem)); } if (changedMask) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, 64, data); - - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)){ + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + assert( + outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 + ); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); Addr miss_addr = MSHRMap[block_index][0]; - // TODO: Make sure this trick works; - Addr alligned_miss_addr = (miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(alligned_miss_addr, 64); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = (miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); outstandingMemReqQueue.push_back(write_pkt); outstandingMemReqQueue.push_back(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue. 
" + "outstandingMemReqQueue.size = %u.\n", __func__, + outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); outstandingMemReqQueue.push_back(write_pkt); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue. 
oustandingMemReqQueue.size = " + "%u.\n", __func__, outstandingMemReqQueue.size()); // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); } if ((changedMask & (2)) == 2) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); } if ((changedMask & (4)) == 4) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); } if ((changedMask & (8)) == 8) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); } cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; evictQueue.pop_front(); - DPRINTF(MPU, "%s: evictQueue.size: %u.\n", - __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: Commit failed due to full reqQueue.\n" , - __func__); + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } } else { + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary. Deallocated cache line[%d].\n", + __func__, block_index, block_index); evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } if ((!nextMemReqEvent.scheduled()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 5c4e752cbf..902a960301 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -74,6 +74,8 @@ class CoalesceEngine : public BaseReadEngine std::unordered_map> MSHRMap; int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; std::deque outstandingMemReqQueue; std::deque addrResponseQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70d6242f5b..c9ed781d79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -212,7 +212,7 @@ PushEngine::processNextPushEvent() __func__, pkt->getAddr(), offset); Edge* e = (Edge*) (data + offset); - DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); // TODO: Implement propagate function here uint32_t update_value = value + 1; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27c7ad4fea..ea45cae652 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -68,7 +68,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a reqRetry.\n", __func__); + DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -121,43 +121,49 @@ WLEngine::processNextReadEvent() PacketPtr update = updateQueue.front(); Addr update_addr = update->getAddr(); uint32_t update_value = update->getLE(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " + "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { + DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { if (coalesceEngine->recvReadAddr(update_addr)) { - DPRINTF(MPU, "%s: Received an update and it's not been pulled in. " - "update_addr: %lu, update_value: %u.\n", - __func__, update_addr, update_value); onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: onTheFlyUpdateMap[%lu] = %d.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, + update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: 0: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } - } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Hitting in the onTheFlyUpdateMap." - "update_addr: %lu, update_value: %u, old_value: %u.\n", - __func__, update_addr, update_value, - onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); + DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", + __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: 1: updateQueue.size: %d.\n", __func__, updateQueue.size()); + DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); if (updateQueue.size() == updateQueueSize - 1) { respPort.checkRetryReq(); } } // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && - (!updateQueue.empty())) { + if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { schedule(nextReadEvent, nextCycle()); } } @@ -166,9 +172,14 @@ void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + addrWorkListMap[addr] = wl; - // TODO: Add checks to see if scheduling is necessary or correct. - if ((!nextReduceEvent.scheduled()) && (!addrWorkListMap.empty())) { + DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", + __func__, addr, wl.to_string()); + + assert(!addrWorkListMap.empty()); + if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } } @@ -182,25 +193,32 @@ WLEngine::processNextReduceEvent() std::vector servicedAddresses; while (it != addrWorkListMap.end()) { Addr addr = it->first; - WorkListItem wl = it->second; uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: updating WorkList[%lu] with the current temp_prop: " - "%d, with new update: %d.\n", __func__, addr, wl.tempProp, - onTheFlyUpdateMap[addr]); + DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " + "addrWorkListMap[%lu] = %s.\n", __func__, + addr, onTheFlyUpdateMap[addr], + addr, addrWorkListMap[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - wl.tempProp = std::min(update_value, wl.tempProp); + addrWorkListMap[addr].tempProp = + std::min(update_value, addrWorkListMap[addr].tempProp); + DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, wl); + coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); servicedAddresses.push_back(addr); + DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", + __func__, addr); it++; } addrWorkListMap.clear(); for (int i = 0; i < servicedAddresses.size(); i++) { onTheFlyUpdateMap.erase(servicedAddresses[i]); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, servicedAddresses[i]); } - DPRINTF(MPU, "%s: onTheFlyUpdateMap.size(): %u, servicedAddresses.size(): %u.\n", __func__, onTheFlyUpdateMap.size(), servicedAddresses.size()); } bool @@ -212,9 +230,10 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.push_back(pkt); - + DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + ". updateQueue.size = %u.\n", + __func__, updateQueue.size()); assert(!updateQueue.empty()); - DPRINTF(MPU, "%s: updateQueue.size: %d.\n", __func__, updateQueue.size()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } From 8195339d419b32284f92f4c14395efc58a245604 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 7 Apr 2022 15:06:58 -0700 Subject: [PATCH 083/287] Restructing inheritance and fixiing inf queue. 
--- configs/accl/sega.py | 6 +- src/accl/graph/TODO.md | 1 + src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 83 ++++++++ src/accl/graph/base/base_read_engine.hh | 18 +- src/accl/graph/base/data_structs.hh | 8 +- src/accl/graph/sega/CoalesceEngine.py | 3 +- src/accl/graph/sega/PushEngine.py | 1 - src/accl/graph/sega/coalesce_engine.cc | 254 ++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 16 +- src/accl/graph/sega/push_engine.cc | 142 ++++++------- src/accl/graph/sega/push_engine.hh | 55 +++-- src/accl/graph/sega/wl_engine.cc | 10 +- src/accl/graph/sega/wl_engine.hh | 2 +- 14 files changed, 348 insertions(+), 254 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0c7766fe0..8e24280366 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,9 +2,9 @@ from m5.objects import * class MPU(SubSystem): - def __init__(self): + def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0x80000000, + self.push_engine = PushEngine(base_edge_addr=base_edge_addr, push_req_queue_size=16, mem_resp_queue_size=8) self.coalesce_engine = CoalesceEngine( @@ -58,7 +58,7 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.mpu = MPU() + self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), vertex_binary="facebook/graph_binaries/vertices", diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index d5effbeb96..a0e2cefeff 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -6,3 +6,4 @@ and memory atom size in the coalesce engine * look at all the simobjects and come up with a general architecture. Make sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. 
+* get rid of maps with RequestPtr as keys diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 84c53465b9..3ddab2d3c4 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -37,3 +37,6 @@ class BaseReadEngine(ClockedObject): system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") + + outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " + "which memory requests are queued.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index a32237db35..e3b588cfc6 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/base/base_read_engine.hh" +#include "debug/MPU.hh" namespace gem5 { @@ -35,6 +36,10 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + alarmRequested(false), + spaceRequested(0), + nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)) {} @@ -83,6 +88,31 @@ BaseReadEngine::MemPort::recvReqRetry() } } +void +BaseReadEngine::processNextMemReqEvent() +{ + if (memPort.blocked()) { + return; + } + + // TODO: Maybe add a DPRINTF here. 
+ PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + outstandingMemReqQueue.pop_front(); + + if (alarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - spaceRequested))) { + alarmRequested = false; + spaceRequested = 0; + respondToAlarm(); + } + + if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + PacketPtr BaseReadEngine::createReadPacket(Addr addr, unsigned int size) { @@ -98,4 +128,57 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) return pkt; } +PacketPtr +BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +bool +BaseReadEngine::memReqQueueHasSpace(int space) +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return ( + outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + ); +} + +bool +BaseReadEngine::memReqQueueFull() +{ + assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); +} + +void +BaseReadEngine::enqueueMemReq(PacketPtr pkt) +{ + panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + outstandingMemReqQueue.push_back(pkt); + + assert(!outstandingMemReqQueue.empty()); + if (!nextMemReqEvent.scheduled()) { + schedule(nextMemReqEvent, nextCycle()); + } +} + +void +BaseReadEngine::requestAlarm(int space) { + panic_if((alarmRequested == true) || (spaceRequested != 0), + "You should not request another alarm without the first one being" + "responded to.\n"); + alarmRequested = true; + spaceRequested = space; +} + } diff --git 
a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index e21aaa01d2..bec922beef 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,16 +68,30 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int outstandingMemReqQueueSize; + bool alarmRequested; + int spaceRequested; + std::deque outstandingMemReqQueue; + + EventFunctionWrapper nextMemReqEvent; + void processNextMemReqEvent(); + protected: const RequestorID _requestorId; - bool memPortBlocked() { return memPort.blocked(); } - void sendMemReq(PacketPtr pkt) { memPort.sendPacket(pkt); } void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + bool memReqQueueHasSpace(int space); + bool memReqQueueFull(); + void enqueueMemReq(PacketPtr pkt); + bool pendingAlarm() { return alarmRequested; } + void requestAlarm(int space); + + virtual void respondToAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: PARAMS(BaseReadEngine); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index dacb74e38c..28a503528f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -70,10 +70,10 @@ struct __attribute__ ((packed)) Edge uint16_t weight : 16; uint64_t neighbor : 48; - // std::string to_string() - // { - // return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); - // } + std::string to_string() + { + return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); + } Edge(uint16_t weight, uint64_t neighbor): weight(weight), diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 0330da7576..bec7e3d233 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -33,8 +33,7 @@ class 
CoalesceEngine(BaseReadEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - + peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - outstanding_mem_req_queue_size = Param.Int(20, "") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 129d9454c7..645bc5f4ea 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -36,5 +36,4 @@ class PushEngine(BaseReadEngine): req_port = RequestPort("Port to send updates to the outside") base_edge_addr = Param.Addr("") - mem_resp_queue_size = Param.Int(0, "") push_req_queue_size = Param.Int(0, "") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7fa806fff..015629245b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -40,10 +40,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) @@ -85,14 +81,11 @@ CoalesceEngine::recvReadAddr(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - // TODO: Make addrQueue and wlQueue into one std::pair - addrResponseQueue.push_back(addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. 
Pushed cacheBlocks[%d][%d]: %s " - "to worklistResponseQueue. worklistResponseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - worklistResponseQueue.size(), + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a bitset instead of unsigned int for takenMask cacheBlocks[block_index].takenMask |= (1 << wl_offset); @@ -100,7 +93,7 @@ CoalesceEngine::recvReadAddr(Addr addr) stats.readHits++; stats.numVertexReads++; - assert(!worklistResponseQueue.empty() && !addrResponseQueue.empty()); + assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -136,21 +129,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); return true; } else { - // TODO: Set valid to false every deallocation and assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - assert( - outstandingMemReqQueue.size() <= - outstandingMemReqQueueSize); + //TODO: Fix this to work with new inheritance. + // assert( + // outstandingMemReqQueue.size() <= + // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (outstandingMemReqQueue.size() == - outstandingMemReqQueueSize) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue " - "(outstandingMemReqQueue.size: %u). " - "Rejecting request.\n", __func__, - outstandingMemReqQueue.size()); + if (memReqQueueFull()) { + DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + "Rejecting request.\n", __func__); return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -169,17 +159,10 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = 64.\n", __func__, addr, aligned_addr); - outstandingMemReqQueue.push_back(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %d", __func__, - outstandingMemReqQueue.size()); - + enqueueMemReq(pkt); + DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + __func__); stats.numVertexBlockReads++; - - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { - schedule(nextMemReqEvent, nextCycle()); - } return true; } } @@ -207,65 +190,41 @@ CoalesceEngine::recvReadAddr(Addr addr) } } -void -CoalesceEngine::processNextMemReqEvent() -{ - PacketPtr pkt = outstandingMemReqQueue.front(); - - if (!memPortBlocked()) { - sendMemReq(pkt); - DPRINTF(MPU, "%s: Sent a memory request to Addr: %lu, MemCmd: %s.\n", - __func__, pkt->getAddr(), pkt->isRead() ? "Read" : "Write"); - outstandingMemReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a packet from outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); - } - - if ((alarmRequested) && - (outstandingMemReqQueue.size() < - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - schedule(nextApplyAndCommitEvent, nextCycle()); - DPRINTF(MPU, "%s: There is an alarm request for " - "nextApplyAndCommitEvent. 
Reset alarm parameters and scheduled " - "nextApplyAndCommitEvent.\n", __func__); - } - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - void CoalesceEngine::processNextRespondEvent() { - Addr addr_response = addrResponseQueue.front(); - WorkListItem worklist_response = worklistResponseQueue.front(); + Addr addr_response; + WorkListItem worklist_response; + std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); - addrResponseQueue.pop_front(); - worklistResponseQueue.pop_front(); + responseQueue.pop_front(); DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " "worklistResponseQueue.size = %d.\n", __func__, - worklistResponseQueue.size()); + responseQueue.size()); if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } } +void +CoalesceEngine::respondToAlarm() +{ + assert(!nextApplyAndCommitEvent.scheduled()); + schedule(nextApplyAndCommitEvent, nextCycle()); +} + bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); if (pkt->isWrite()) { + delete pkt; DPRINTF(MPU, "%s: Received a write response for Addr: %lu. 
Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; @@ -291,6 +250,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + delete pkt; int bias = 0; std::vector servicedIndices; @@ -303,13 +263,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); - addrResponseQueue.push_back(miss_addr); - worklistResponseQueue.push_back( - cacheBlocks[block_index].items[wl_offset]); + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " - "worklistResponseQueue. worklistResponseQueue.size = %u.\n" + "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, - worklistResponseQueue.size()); + responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.numVertexReads++; servicedIndices.push_back(i); @@ -336,8 +295,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if ((!nextRespondEvent.scheduled()) && - (!worklistResponseQueue.empty()) && - (!addrResponseQueue.empty())) { + (!responseQueue.empty())) { schedule(nextRespondEvent, nextCycle()); } @@ -363,7 +321,8 @@ CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - Addr aligned_addr = (addr / 64) * 64; + // TODO: Parameterize all the numbers here. 
+ Addr aligned_addr = std::floor(addr / 64) * 64; int block_index = (aligned_addr / 64) % 256; int wl_offset = (addr - aligned_addr) / 16; @@ -371,6 +330,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == (1 << wl_offset)); + + if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + cacheBlocks[block_index].hasChange = true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; @@ -378,7 +342,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - // && (cacheBlocks[block_index].hasConflict) + // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add + // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -389,8 +354,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())&& - ((!alarmRequested) && (spaceRequested == 0))) { + (!evictQueue.empty()) && + (pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } @@ -399,36 +364,45 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyAndCommitEvent() { - assert((!alarmRequested) && (spaceRequested == 0)); + // FIXME: Refactor the line below to work with the new inheritance. 
+ // assert((!alarmRequested) && (spaceRequested == 0)); int block_index = evictQueue.front(); uint8_t changedMask = 0; - // TODO: parameterize 64 to memory atom size - uint8_t* wl_data; - uint8_t data[64]; DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - if ((cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1)) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); - - } else if ((!cacheBlocks[block_index].hasConflict) && - (outstandingMemReqQueue.size() < outstandingMemReqQueueSize)) { + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); } else { - alarmRequested = true; - spaceRequested = cacheBlocks[block_index].hasConflict ? 2 : 1; + int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. 
Set " - "an alarm for nextApplyAndCommitEvent when space = %d.\n", - __func__, spaceRequested); + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); return; } + // Reducing between tempProp and prop for each item in the cache line. for (int i = 0; i < 4; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( @@ -442,23 +416,18 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } - wl_data = (uint8_t*) (cacheBlocks[block_index].items + i); - std::memcpy(data + (i * sizeof(WorkListItem)), - wl_data, sizeof(WorkListItem)); } - if (changedMask) { + if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + // TODO: Parameterize this 64 to memory atom size PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, data); + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", __func__, write_pkt->getAddr()); if (cacheBlocks[block_index].hasConflict) { - assert( - outstandingMemReqQueue.size() < outstandingMemReqQueueSize - 1 - ); DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write " "back packet and its subsequent read packet.\n", @@ -467,18 +436,19 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); // TODO: parameterize 64 - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = 64.\n", __func__, miss_addr, aligned_miss_addr); - outstandingMemReqQueue.push_back(write_pkt); - outstandingMemReqQueue.push_back(read_pkt); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue. " - "outstandingMemReqQueue.size = %u.\n", __func__, - outstandingMemReqQueue.size()); + " to outstandingMemReqQueue.\n" , __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -500,22 +470,25 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } else { - assert(outstandingMemReqQueue.size() < outstandingMemReqQueueSize); DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); - outstandingMemReqQueue.push_back(write_pkt); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue. oustandingMemReqQueue.size = " - "%u.\n", __func__, outstandingMemReqQueue.size()); + "outstandingMemReqQueue.\n", __func__); + // TODO: This should be improved if ((changedMask & (1)) == 1) { peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); @@ -537,33 +510,58 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, 3); } + + // Since allocated is false, does not matter what the address is. cacheBlocks[block_index].takenMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - evictQueue.pop_front(); + cacheBlocks[block_index].hasChange = false; DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " " = %u.\n", __func__, evictQueue.size()); } } else { - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary. Deallocated cache line[%d].\n", + "backs are necessary.\n", __func__, block_index, block_index); - evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { - stats.numVertexBlockWrites++; - schedule(nextMemReqEvent, nextCycle()); + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + } } + evictQueue.pop_front(); + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty())) { schedule(nextApplyAndCommitEvent, nextCycle()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 902a960301..6a8aadcbae 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -53,6 +53,7 @@ class CoalesceEngine : public BaseReadEngine bool allocated; bool valid; bool hasConflict; + bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block(): @@ -60,7 +61,8 @@ class CoalesceEngine : public BaseReadEngine takenMask(0), allocated(false), valid(false), - hasConflict(false) + hasConflict(false), + hasChange(false) {} }; @@ -73,13 +75,7 @@ class CoalesceEngine : public BaseReadEngine int numTgtsPerMSHR; std::unordered_map> MSHRMap; - int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; - std::deque outstandingMemReqQueue; - - std::deque addrResponseQueue; - std::deque worklistResponseQueue; + std::deque> responseQueue; std::deque evictQueue; @@ -88,9 +84,6 @@ class CoalesceEngine : public BaseReadEngine PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -115,6 +108,7 @@ class CoalesceEngine : public BaseReadEngine CoalesceStats stats; protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9ed781d79..86418ac76e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -39,10 +39,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), 
baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - memRespQueueSize(params.mem_resp_queue_size), - onTheFlyReadReqs(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextReadEvent([this] { processNextReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()) {} @@ -66,12 +63,13 @@ PushEngine::startup() *tempPtr = 0; // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); - PacketPtr first_update = createUpdatePacket(0, 4, (uint32_t) 0); + PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - sendPushUpdate(first_update); + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } } - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -108,19 +106,21 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { - assert(pushReqQueue.size() <= pushReqQueueSize); + assert((pushReqQueueSize == 0) || + (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { return false; } Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t update_value = wl.prop; - pushReqQueue.push_back( - std::make_pair(std::make_pair(start_addr, end_addr), update_value)); + uint32_t value = wl.prop; - if ((!nextAddrGenEvent.scheduled()) && - (!pushReqQueue.empty())) { + // TODO: parameterize 64 to memory atom size + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + + assert(!pushReqQueue.empty()); + if (!nextAddrGenEvent.scheduled()) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -129,65 +129,44 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::processNextAddrGenEvent() { - Addr start_addr, end_addr; - uint32_t update_value; - std::pair, uint32_t> front = pushReqQueue.front(); - std::tie(start_addr, end_addr) = front.first; - update_value = front.second; + Addr 
aligned_addr, offset; + int num_edges; - Addr req_addr = (start_addr / 64) * 64; - Addr req_offset = start_addr % 64; - int num_edges = 0; + PushPacketInfoGen curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (end_addr > req_addr + 64) { - num_edges = (req_addr + 64 - start_addr) / sizeof(Edge); - } else { - num_edges = (end_addr - start_addr) / sizeof(Edge); - } - PacketPtr pkt = createReadPacket(req_addr, 64); - reqOffsetMap[pkt->req] = req_offset; + PacketPtr pkt = createReadPacket(aligned_addr, 64); + reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = update_value; - pendingReadReqs.push_back(pkt); + reqValueMap[pkt->req] = curr_info.value(); - pushReqQueue.pop_front(); + enqueueMemReq(pkt); - if (req_addr + 64 < end_addr) { - pushReqQueue.push_front( - std::make_pair(std::make_pair(req_addr + 64, end_addr), update_value) - ); + if (curr_info.done()) { + pushReqQueue.pop_front(); } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((memReqQueueFull()) && (!pushReqQueue.empty())) { + requestAlarm(1); + return; } - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { + schedule(nextAddrGenEvent, nextCycle()); } } void -PushEngine::processNextReadEvent() +PushEngine::respondToAlarm() { - if (((memRespQueue.size() + onTheFlyReadReqs) <= memRespQueueSize) && - (!memPortBlocked())) { - PacketPtr pkt = pendingReadReqs.front(); - sendMemReq(pkt); - onTheFlyReadReqs++; - pendingReadReqs.pop_front(); - } - - if ((!nextReadEvent.scheduled()) && (!pendingReadReqs.empty())) { - schedule(nextReadEvent, nextCycle()); - } + assert(!nextAddrGenEvent.scheduled()); + schedule(nextAddrGenEvent, nextCycle()); } bool PushEngine::handleMemResp(PacketPtr pkt) { - onTheFlyReadReqs--; 
memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { @@ -201,39 +180,42 @@ void PushEngine::processNextPushEvent() { PacketPtr pkt = memRespQueue.front(); - RequestPtr req = pkt->req; - uint8_t *data = pkt->getPtr(); + uint8_t* data = pkt->getPtr(); - Addr offset = reqOffsetMap[req]; - uint32_t value = reqValueMap[req]; + Addr offset = reqOffsetMap[pkt->req]; + assert(offset < 64); + uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); - Edge* e = (Edge*) (data + offset); - // DPRINTF(MPU, "%s: Read %s\n", __func__, e->to_string()); + Edge* curr_edge = (Edge*) (data + offset); // TODO: Implement propagate function here uint32_t update_value = value + 1; DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, e->neighbor, update_value); + __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket(e->neighbor, - sizeof(uint32_t), update_value); + PacketPtr update = createUpdatePacket( + curr_edge->neighbor, update_value); - if (sendPushUpdate(update)) { + if (!reqPort.blocked()) { DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, e->neighbor, update_value); - reqOffsetMap[req] = reqOffsetMap[req] + sizeof(Edge); - reqNumEdgeMap[req]--; - } - - if (reqNumEdgeMap[req] == 0) { + __func__, curr_edge->neighbor, update_value); + reqPort.sendPacket(update); + reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); + assert(reqOffsetMap[pkt->req] <= 64); + reqNumEdgeMap[pkt->req]--; + assert(reqNumEdgeMap[pkt->req] >= 0); + } + + if (reqNumEdgeMap[pkt->req] == 0) { + reqOffsetMap.erase(pkt->req); + reqNumEdgeMap.erase(pkt->req); + reqValueMap.erase(pkt->req); + delete pkt; memRespQueue.pop_front(); - reqOffsetMap.erase(req); - reqNumEdgeMap.erase(req); - reqValueMap.erase(req); } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { @@ 
-241,11 +223,11 @@ PushEngine::processNextPushEvent() } } -PacketPtr -// PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint8_t* data) -PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + RequestPtr req = std::make_shared( + addr, sizeof(T), 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) _requestorId) << 2); @@ -255,19 +237,9 @@ PushEngine::createUpdatePacket(Addr addr, unsigned int size, uint32_t value) pkt->allocate(); // pkt->setData(data); - pkt->setLE(value); + pkt->setLE(value); return pkt; } -bool -PushEngine::sendPushUpdate(PacketPtr pkt) -{ - if (!reqPort.blocked()) { - reqPort.sendPacket(pkt); - return true; - } - return false; -} - } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a539079ede..2aba0ca008 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -39,6 +39,42 @@ namespace gem5 class PushEngine : public BaseReadEngine { private: + class PushPacketInfoGen { + private: + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + uint32_t _value; + + public: + PushPacketInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _value(value) + {} + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = std::floor(_start / _atom) * _atom; + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (_start + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + _start = aligned_addr + _atom; + + return std::make_tuple(aligned_addr, offset, num_items); + } + + uint32_t value() { return _value; } + bool done() { 
return (_start >= _end); } + }; + class ReqPort : public RequestPort { private: @@ -64,37 +100,30 @@ class PushEngine : public BaseReadEngine Addr baseEdgeAddr; int pushReqQueueSize; - std::deque, uint32_t>> pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // TODO: Possibility of infinite queueing - std::deque pendingReadReqs; - - int memRespQueueSize; - int onTheFlyReadReqs; + // Since the push engine can process incoming packets faster than + // memory can send those packets, the size of this queue will + // always be limited by the b/w of the memory. std::deque memRespQueue; virtual void startup(); - // PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint8_t* data); - PacketPtr createUpdatePacket(Addr addr, unsigned int size, uint32_t value); - - bool sendPushUpdate(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; void processNextAddrGenEvent(); - EventFunctionWrapper nextReadEvent; - void processNextReadEvent(); - EventFunctionWrapper nextPushEvent; void processNextPushEvent(); protected: + virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ea45cae652..cca945ce0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -118,9 +118,10 @@ WLEngine::getAddrRanges() const void WLEngine::processNextReadEvent() { - PacketPtr update = updateQueue.front(); - Addr update_addr = update->getAddr(); - uint32_t update_value = update->getLE(); + Addr update_addr; + uint32_t update_value; + std::tie(update_addr, update_value) = updateQueue.front(); + DPRINTF(MPU, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); @@ -229,10 +230,11 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.push_back(pkt); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); + delete pkt; assert(!updateQueue.empty()); if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 476c9be932..12df93ee79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -71,7 +71,7 @@ class WLEngine : public BaseReduceEngine CoalesceEngine* coalesceEngine; int updateQueueSize; - std::deque updateQueue; + std::deque> updateQueue; int onTheFlyUpdateMapSize; std::unordered_map onTheFlyUpdateMap; From 02f7baf9938e2a9b30ea3d9b44140862160b5aba Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Apr 2022 13:13:37 -0700 Subject: [PATCH 084/287] Fixing one scheduling error in events. 
--- configs/accl/sega.py | 7 +++---- src/accl/graph/base/base_read_engine.cc | 12 ++++++++++++ src/accl/graph/base/base_read_engine.hh | 2 ++ src/accl/graph/sega/coalesce_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.cc | 23 +++++++++++++++++------ src/accl/graph/sega/wl_engine.cc | 4 ++-- 6 files changed, 40 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e24280366..e45580dd37 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,13 +5,12 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, - mem_resp_queue_size=8) + push_req_queue_size=16) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=16, + on_the_fly_update_map_size=8) self.interconnect = SystemXBar() self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index e3b588cfc6..1658d85627 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -86,6 +86,8 @@ BaseReadEngine::MemPort::recvReqRetry() if (!blocked()) { blockedPacket = nullptr; } + + owner->wakeUp(); } void @@ -177,8 +179,18 @@ BaseReadEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); + DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); alarmRequested = true; spaceRequested = space; } +void +BaseReadEngine::wakeUp() +{ + if ((!nextMemReqEvent.scheduled()) && + (!outstandingMemReqQueue.empty())) { + schedule(nextMemReqEvent, nextCycle()); + } +} + } diff --git a/src/accl/graph/base/base_read_engine.hh 
b/src/accl/graph/base/base_read_engine.hh index bec922beef..5275f86449 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -108,6 +108,8 @@ class BaseReadEngine : public ClockedObject void recvFunctional(PacketPtr pkt); + void wakeUp(); + }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 015629245b..c740597a2c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -202,8 +202,8 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from worklistResponseQueue. " - "worklistResponseQueue.size = %d.\n", __func__, + DPRINTF(MPU, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, responseQueue.size()); if ((!nextRespondEvent.scheduled()) && @@ -338,7 +338,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); stats.numVertexWrites++; - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, + DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -355,7 +355,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((!nextApplyAndCommitEvent.scheduled()) && (!evictQueue.empty()) && - (pendingAlarm())) { + (!pendingAlarm())) { schedule(nextApplyAndCommitEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 86418ac76e..3c1a98c69a 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -120,7 +120,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); assert(!pushReqQueue.empty()); - if (!nextAddrGenEvent.scheduled()) { + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } return true; @@ -133,8 +134,11 @@ PushEngine::processNextAddrGenEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen curr_info = pushReqQueue.front(); + PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(MPU, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, 64); reqOffsetMap[pkt->req] = offset; @@ -144,11 +148,17 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { + DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); + DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); } - if ((memReqQueueFull()) && (!pushReqQueue.empty())) { - requestAlarm(1); + if (memReqQueueFull()) { + if (!pushReqQueue.empty()) { + requestAlarm(1); + } return; } @@ -162,6 +172,7 @@ PushEngine::respondToAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); + DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool @@ -201,9 +212,9 @@ PushEngine::processNextPushEvent() curr_edge->neighbor, update_value); if (!reqPort.blocked()) { - DPRINTF(MPU, "%s: Send a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); reqPort.sendPacket(update); + DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= 64); reqNumEdgeMap[pkt->req]--; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index cca945ce0a..ad9e93ba60 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -146,7 +146,7 @@ WLEngine::processNextReadEvent() } else { // TODO: Generalize this to reduce function rather than just min DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.", __func__, update_addr, + "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); @@ -231,7 +231,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the front of updateQueue" + DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From 4f58d86c6eae6696ffaf735d5999400db0310d46 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 10 Apr 2022 16:42:27 -0700 Subject: [PATCH 085/287] Works!!!!!! --- configs/accl/sega.py | 4 ++-- src/accl/graph/TODO.md | 6 ++++++ src/accl/graph/sega/push_engine.cc | 8 ++++++++ src/accl/graph/sega/push_engine.hh | 2 +- 4 files changed, 17 insertions(+), 3 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e45580dd37..e68097ce74 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,7 +11,7 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port self.interconnect.cpu_side_ports = self.push_engine.mem_port @@ -40,7 +40,7 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar() + self.interconnect = SystemXBar(max_routing_table_size=16384) self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port self.interconnect.mem_side_ports = self.edge_mem_ctrl.port diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index a0e2cefeff..f6d77d5e22 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -7,3 +7,9 @@ and memory atom size in the coalesce engine sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys + + +Advice from Jason: +* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
+* if it +* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 3c1a98c69a..1fced87a43 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -106,6 +106,14 @@ PushEngine::ReqPort::recvReqRetry() bool PushEngine::recvWLItem(WorkListItem wl) { + // If there are no outdoing edges, no need to generate and push + // updates. Therefore, we only need to return true. + if (wl.degree == 0) { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + return true; + } + assert((pushReqQueueSize == 0) || (pushReqQueue.size() <= pushReqQueueSize)); if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2aba0ca008..29d18709ee 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -61,7 +61,7 @@ class PushEngine : public BaseReadEngine Addr offset = _start - aligned_addr; int num_items = 0; - if (_end > (_start + _atom)) { + if (_end > (aligned_addr + _atom)) { num_items = (_atom - offset) / _step; } else { num_items = (_end - _start) / _step; From b920f152ec5a935d159e0d36904e7dba5079a502 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 09:59:31 -0700 Subject: [PATCH 086/287] Removing SystemXBar from config script. 
[has-bug] --- configs/accl/sega.py | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e68097ce74..dd7623bfea 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -11,10 +11,6 @@ def __init__(self, base_edge_addr): self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) - self.interconnect = SystemXBar(max_routing_table_size=16384) - - self.interconnect.cpu_side_ports = self.coalesce_engine.mem_port - self.interconnect.cpu_side_ports = self.push_engine.mem_port def getRespPort(self): return self.wl_engine.resp_port @@ -26,10 +22,15 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.req_port = port - def getMemPort(self): - return self.interconnect.mem_side_ports - def setMemPort(self, port): - self.interconnect.mem_side_ports = port + def getVertexMemPort(self): + return self.coalesce_engine.mem_port + def setVertexMemPort(self, port): + self.coalesce_engine.mem_port = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): @@ -40,15 +41,16 @@ def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): self.edge_mem_ctrl = SimpleMemory( range=edge_range, bandwidth="25GB/s", latency="30ns", image_file=edge_binary) - self.interconnect = SystemXBar(max_routing_table_size=16384) - self.interconnect.mem_side_ports = self.vertex_mem_ctrl.port - self.interconnect.mem_side_ports = self.edge_mem_ctrl.port + def getVertexPort(self): + return self.vertex_mem_ctrl.port + def setVertexPort(self, port): + self.vertex_mem_ctrl.port = port - def getPort(self): - return self.interconnect.cpu_side_ports - def setPort(self, port): - self.interconnect.cpu_side_ports = port + def getEdgePort(self): + 
return self.edge_mem_ctrl.port + def setEdgePort(self, port): + self.edge_mem_ctrl.port = port class SEGA(System): def __init__(self): @@ -65,7 +67,8 @@ def __init__(self): edge_binary="facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) - self.mpu.setMemPort(self.mem_ctrl.getPort()) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) + self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) system = SEGA() root = Root(full_system = False, system = system) From 58e3b63ea66d9709147566e3e72c882d9bd7216e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 12 Apr 2022 19:59:24 -0700 Subject: [PATCH 087/287] Fixing the bug when deallocating a taken line. --- configs/accl/sega.py | 4 +- src/accl/graph/base/BaseReadEngine.py | 3 + src/accl/graph/base/base_read_engine.cc | 1 + src/accl/graph/base/base_read_engine.hh | 2 + src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 388 +++++++++++++----------- 6 files changed, 222 insertions(+), 178 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index dd7623bfea..7f4663cc82 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -36,10 +36,10 @@ class MPUMemory(SubSystem): def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): super(MPUMemory, self).__init__() self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="25GB/s", + range=vertex_range, bandwidth="19.2GB/s", latency="30ns", image_file=vertex_binary) self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="25GB/s", + range=edge_range, bandwidth="19.2GB/s", latency="30ns", image_file=edge_binary) def getVertexPort(self): diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseReadEngine.py index 3ddab2d3c4..d4ab622fd6 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseReadEngine.py @@ -40,3 +40,6 @@ class BaseReadEngine(ClockedObject): outstanding_mem_req_queue_size = Param.Int(16, 
"Capacity of queue in " "which memory requests are queued.") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 1658d85627..19214a3bd1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,6 +36,7 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 5275f86449..0cab95dbbb 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,6 +68,8 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; + int peerMemoryAtomSize; + int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index bec7e3d233..3e5699f552 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseReadEngine): peer_push_engine = Param.PushEngine(NULL, "") num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index c740597a2c..41d1fe4953 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -88,7 +88,11 @@ CoalesceEngine::recvReadAddr(Addr addr) __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Use a 
bitset instead of unsigned int for takenMask + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,7 +148,11 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -256,7 +264,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = (miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / 16; @@ -269,7 +277,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -336,7 +348,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -373,189 +389,209 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); - - if ((cacheBlocks[block_index].hasChange)&& - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - 
(!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); - return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < 4; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + if (cacheBlocks[block_index].takenMask == 0) { + if ((cacheBlocks[block_index].hasChange)&& + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(2))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (cacheBlocks[block_index].hasConflict) && + (memReqQueueHasSpace(1))) { + DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", + __func__, block_index); + } else if ((!cacheBlocks[block_index].hasChange) && + (!cacheBlocks[block_index].hasConflict)) { + DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", + __func__, block_index); + } else { + int spaceNeeded = 
cacheBlocks[block_index].hasConflict ? 2 : 1; + requestAlarm(spaceNeeded); + DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " + "an alarm for nextApplyAndCommitEvent when there is %d space.\n", + __func__, spaceNeeded); + return; } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); + // Reducing between tempProp and prop for each item in the cache line. 
+ for (int i = 0; i < 4; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, + block_index, i, + cacheBlocks[block_index].items[i].to_string()); + if (old_prop != cacheBlocks[block_index].items[i].prop) { + changedMask |= (1 << i); + DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", + __func__, block_index, i); } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + } + + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" + , __func__, block_index); + // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, 64, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", + __func__, write_pkt->getAddr()); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + // TODO: This should be improved + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + enqueueMemReq(write_pkt); + stats.numVertexBlockWrites++; + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // TODO: This should be improved + if ((changedMask & (1)) == 1) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 0); + } + if ((changedMask & (2)) == 2) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 1); + } + if ((changedMask & (4)) == 4) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 2); + } + if ((changedMask & (8)) == 8) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, 3); + } + + // Since allocated is false, does not matter what the address is. + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " + " = %u.\n", __func__, evictQueue.size()); } - // TODO: This should be improved - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " + "backs are necessary.\n", __func__, block_index); + if (cacheBlocks[block_index].hasConflict) { + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " + "enough space in outstandingMemReqQueue for the write " + "back packet and its subsequent read packet.\n", + __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + // TODO: parameterize 64 + Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + " req addr (aligned_addr) = %lu, size = 64.\n", + __func__, miss_addr, aligned_miss_addr); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " + "deallocating the line.\n", __func__, block_index); + + DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; + DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, + cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", - __func__, block_index, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " + "for eviction. 
Therefore, ignoring the evict schedule.\n", + __func__, block_index); } evictQueue.pop_front(); From 6e7cb504f2c0e2db7e4d1b417994ab53e200ff7c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 09:46:44 -0700 Subject: [PATCH 088/287] Parameterizing cache_size and memory_atom_size. --- src/accl/graph/TODO.md | 12 --- src/accl/graph/base/base_read_engine.cc | 7 +- src/accl/graph/base/base_read_engine.hh | 4 +- src/accl/graph/sega/CoalesceEngine.py | 5 +- src/accl/graph/sega/coalesce_engine.cc | 127 +++++++++--------------- src/accl/graph/sega/coalesce_engine.hh | 16 +-- src/accl/graph/sega/push_engine.cc | 9 +- 7 files changed, 74 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f6d77d5e22..1cec4dc6f9 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,15 +1,3 @@ # TODO Items - -* use setLE/setBE inside createUpdatePacket and createWritePacket -* parameterize cache size, associativity, maybe latencies, -and memory atom size in the coalesce engine -* look at all the simobjects and come up with a general architecture. Make -sure all the simobjects follow that architecture. * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys - - -Advice from Jason: -* use tryEnqueueMemReq that returns a boolean that shows if it has succeeded to enqueue the request. 
-* if it -* scratch all of these \ No newline at end of file diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_read_engine.cc index 19214a3bd1..714a4542f1 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_read_engine.cc @@ -36,12 +36,12 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - peerMemoryAtomSize(params.attached_memory_atom_size), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), alarmRequested(false), spaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - _requestorId(system->getRequestorId(this)) + _requestorId(system->getRequestorId(this)), + peerMemoryAtomSize(params.attached_memory_atom_size) {} BaseReadEngine::~BaseReadEngine() @@ -101,6 +101,9 @@ BaseReadEngine::processNextMemReqEvent() // TODO: Maybe add a DPRINTF here. PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); if (alarmRequested && diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_read_engine.hh index 0cab95dbbb..f11459ad6e 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_read_engine.hh @@ -68,8 +68,6 @@ class BaseReadEngine : public ClockedObject System* system; MemPort memPort; - int peerMemoryAtomSize; - int outstandingMemReqQueueSize; bool alarmRequested; int spaceRequested; @@ -81,6 +79,8 @@ class BaseReadEngine : public ClockedObject protected: const RequestorID _requestorId; + size_t peerMemoryAtomSize; + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } bool memReqQueueHasSpace(int space); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 3e5699f552..faa5295ed7 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,10 @@ class CoalesceEngine(BaseReadEngine): cxx_class = 'gem5::CoalesceEngine' peer_push_engine = Param.PushEngine(NULL, "") + + cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 41d1fe4953..4d152e375d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,21 +38,17 @@ namespace gem5 CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): BaseReadEngine(params), peerPushEngine(params.peer_push_engine), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) -{} - -void -CoalesceEngine::startup() { - for (int i = 0; i < 256; i++) { - cacheBlocks[i].takenMask = 0; - cacheBlocks[i].allocated = false; - cacheBlocks[i].valid = false; - cacheBlocks[i].hasConflict = false; + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); } } @@ -74,8 +70,8 @@ CoalesceEngine::recvReadAddr(Addr addr) assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); - Addr aligned_addr = (addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; + Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); if ((cacheBlocks[block_index].addr == aligned_addr) && @@ -162,11 +158,11 @@ CoalesceEngine::recvReadAddr(Addr addr) MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); - // TODO: Parameterize 64 to memory atom size - PacketPtr pkt = createReadPacket(aligned_addr, 64); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, addr, aligned_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); @@ -240,10 +236,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); uint8_t* data = pkt->getPtr(); - // TODO: After parameterizing the cache size - // this 256 number should change to the cache - // size parameter. 
- int block_index = (addr / 64) % 256; + + int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -264,10 +258,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; + Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / 16; + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); @@ -334,9 +328,9 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. - Addr aligned_addr = std::floor(addr / 64) * 64; - int block_index = (aligned_addr / 64) % 256; - int wl_offset = (addr - aligned_addr) / 16; + Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); @@ -437,12 +431,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() if (cacheBlocks[block_index].hasChange) { DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" , __func__, block_index); - // TODO: Parameterize this 64 to memory atom size + PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, 64, + cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = 64.\n", - __func__, write_pkt->getAddr()); + DPRINTF(MPU, 
"%s: Created a write packet to Addr: %lu, size = %d.\n", + __func__, write_pkt->getAddr(), peerMemoryAtomSize); if (cacheBlocks[block_index].hasConflict) { DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " "enough space in outstandingMemReqQueue for the write " @@ -451,12 +445,15 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", - __func__, miss_addr, aligned_miss_addr); + " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); stats.numVertexBlockWrites++; @@ -465,28 +462,13 @@ CoalesceEngine::processNextApplyAndCommitEvent() "its subsequent read packet (to service the conflicts)" " to outstandingMemReqQueue.\n" , __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - 
peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } - // TODO: This should be improved cacheBlocks[block_index].addr = aligned_miss_addr; DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, @@ -509,26 +491,12 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); - // TODO: This should be improved - if ((changedMask & (1)) == 1) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[0]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 0); - } - if ((changedMask & (2)) == 2) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[1]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 1); - } - if ((changedMask & (4)) == 4) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[2]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 2); - } - if ((changedMask & (8)) == 8) { - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[3]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, 3); + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); + } } // Since allocated is false, does not matter what the address is. 
@@ -555,11 +523,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() Addr miss_addr = MSHRMap[block_index][0]; DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" " Addr: %lu.\n", __func__, block_index, miss_addr); - // TODO: parameterize 64 - Addr aligned_miss_addr = std::floor(miss_addr / 64) * 64; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, 64); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * + peerMemoryAtomSize; + PacketPtr read_pkt = createReadPacket( + aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = 64.\n", + " req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, aligned_miss_addr); enqueueMemReq(read_pkt); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6a8aadcbae..0ddbdfdeb1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,7 @@ class CoalesceEngine : public BaseReadEngine private: struct Block { - WorkListItem items[4]; + WorkListItem* items; Addr addr; uint8_t takenMask; bool allocated; @@ -56,20 +56,26 @@ class CoalesceEngine : public BaseReadEngine bool hasChange; // TODO: This might be useful in the future // Tick lastWLWriteTick; - Block(): + Block() {} + Block(int num_elements): addr(0), takenMask(0), allocated(false), valid(false), hasConflict(false), hasChange(false) - {} + { + items = new WorkListItem [num_elements]; + } }; WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block cacheBlocks[256]; + Block* cacheBlocks; + + int numLines; + int numElementsPerLine; int numMSHREntry; int numTgtsPerMSHR; @@ -79,8 +85,6 @@ class CoalesceEngine : public BaseReadEngine std::deque evictQueue; - virtual void startup(); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); diff --git 
a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 1fced87a43..8dcbac0dcc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -124,8 +124,7 @@ PushEngine::recvWLItem(WorkListItem wl) Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - // TODO: parameterize 64 to memory atom size - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), 64, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -148,7 +147,7 @@ PushEngine::processNextAddrGenEvent() "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - PacketPtr pkt = createReadPacket(aligned_addr, 64); + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); @@ -202,7 +201,7 @@ PushEngine::processNextPushEvent() uint8_t* data = pkt->getPtr(); Addr offset = reqOffsetMap[pkt->req]; - assert(offset < 64); + assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " @@ -224,7 +223,7 @@ PushEngine::processNextPushEvent() DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= 64); + assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); reqNumEdgeMap[pkt->req]--; assert(reqNumEdgeMap[pkt->req] >= 0); } From c216819f0ff4c7103a6f62e416f897068a460e52 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:21:28 -0700 Subject: [PATCH 089/287] Renaming BaseReadEngine to BaseMemEngine. 
--- configs/accl/sega.py | 6 ++- .../{BaseReadEngine.py => BaseMemEngine.py} | 8 ++-- src/accl/graph/base/SConscript | 4 +- ...base_read_engine.cc => base_mem_engine.cc} | 30 +++++++------- ...base_read_engine.hh => base_mem_engine.hh} | 20 +++++----- src/accl/graph/base/data_structs.hh | 6 +-- src/accl/graph/sega/CoalesceEngine.py | 4 +- src/accl/graph/sega/PushEngine.py | 4 +- src/accl/graph/sega/coalesce_engine.cc | 39 ++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 4 +- src/accl/graph/sega/push_engine.hh | 4 +- 12 files changed, 52 insertions(+), 81 deletions(-) rename src/accl/graph/base/{BaseReadEngine.py => BaseMemEngine.py} (92%) rename src/accl/graph/base/{base_read_engine.cc => base_mem_engine.cc} (87%) rename src/accl/graph/base/{base_read_engine.hh => base_mem_engine.hh} (88%) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7f4663cc82..7d8b96490d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -5,9 +5,11 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16) + push_req_queue_size=16, + attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine) + peer_push_engine=self.push_engine, + attached_memory_atom_size=64) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/base/BaseReadEngine.py b/src/accl/graph/base/BaseMemEngine.py similarity index 92% rename from src/accl/graph/base/BaseReadEngine.py rename to src/accl/graph/base/BaseMemEngine.py index d4ab622fd6..69f68e9dfc 100644 --- a/src/accl/graph/base/BaseReadEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -29,11 +29,11 @@ from m5.proxy import * from m5.objects.ClockedObject import ClockedObject -class BaseReadEngine(ClockedObject): +class 
BaseMemEngine(ClockedObject): abstract = True - type = 'BaseReadEngine' - cxx_header = "accl/graph/base/base_read_engine.hh" - cxx_class = 'gem5::BaseReadEngine' + type = 'BaseMemEngine' + cxx_header = "accl/graph/base/base_mem_engine.hh" + cxx_class = 'gem5::BaseMemEngine' system = Param.System(Parent.any, 'System this Engine is a part of') mem_port = RequestPort("Port to communicate with the memory") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index ea96f4323b..4c90dfa9a6 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,8 +27,8 @@ Import('*') -SimObject('BaseReadEngine.py') +SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_read_engine.cc') +Source('base_mem_engine.cc') Source('base_reduce_engine.cc') diff --git a/src/accl/graph/base/base_read_engine.cc b/src/accl/graph/base/base_mem_engine.cc similarity index 87% rename from src/accl/graph/base/base_read_engine.cc rename to src/accl/graph/base/base_mem_engine.cc index 714a4542f1..50e64ae7c3 100644 --- a/src/accl/graph/base/base_read_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -26,13 +26,13 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" namespace gem5 { -BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): +BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), @@ -44,11 +44,11 @@ BaseReadEngine::BaseReadEngine(const BaseReadEngineParams ¶ms): peerMemoryAtomSize(params.attached_memory_atom_size) {} -BaseReadEngine::~BaseReadEngine() +BaseMemEngine::~BaseMemEngine() {} Port& -BaseReadEngine::getPort(const std::string &if_name, PortID idx) +BaseMemEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "mem_port") { return memPort; @@ -58,7 +58,7 @@ BaseReadEngine::getPort(const std::string &if_name, PortID idx) } void -BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) +BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
@@ -70,14 +70,14 @@ BaseReadEngine::MemPort::sendPacket(PacketPtr pkt) } bool -BaseReadEngine::MemPort::recvTimingResp(PacketPtr pkt) +BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time return owner->handleMemResp(pkt); } void -BaseReadEngine::MemPort::recvReqRetry() +BaseMemEngine::MemPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); @@ -92,7 +92,7 @@ BaseReadEngine::MemPort::recvReqRetry() } void -BaseReadEngine::processNextMemReqEvent() +BaseMemEngine::processNextMemReqEvent() { if (memPort.blocked()) { return; @@ -120,7 +120,7 @@ BaseReadEngine::processNextMemReqEvent() } PacketPtr -BaseReadEngine::createReadPacket(Addr addr, unsigned int size) +BaseMemEngine::createReadPacket(Addr addr, unsigned int size) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher @@ -135,7 +135,7 @@ BaseReadEngine::createReadPacket(Addr addr, unsigned int size) } PacketPtr -BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) { RequestPtr req = std::make_shared(addr, size, 0, _requestorId); @@ -151,7 +151,7 @@ BaseReadEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseReadEngine::memReqQueueHasSpace(int space) +BaseMemEngine::memReqQueueHasSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -160,14 +160,14 @@ BaseReadEngine::memReqQueueHasSpace(int space) } bool -BaseReadEngine::memReqQueueFull() +BaseMemEngine::memReqQueueFull() { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); } void -BaseReadEngine::enqueueMemReq(PacketPtr pkt) +BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not 
enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); @@ -179,7 +179,7 @@ BaseReadEngine::enqueueMemReq(PacketPtr pkt) } void -BaseReadEngine::requestAlarm(int space) { +BaseMemEngine::requestAlarm(int space) { panic_if((alarmRequested == true) || (spaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); @@ -189,7 +189,7 @@ BaseReadEngine::requestAlarm(int space) { } void -BaseReadEngine::wakeUp() +BaseMemEngine::wakeUp() { if ((!nextMemReqEvent.scheduled()) && (!outstandingMemReqQueue.empty())) { diff --git a/src/accl/graph/base/base_read_engine.hh b/src/accl/graph/base/base_mem_engine.hh similarity index 88% rename from src/accl/graph/base/base_read_engine.hh rename to src/accl/graph/base/base_mem_engine.hh index f11459ad6e..fb7cab91b0 100644 --- a/src/accl/graph/base/base_read_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -26,33 +26,33 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_READ_ENGINE_HH__ +#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ +#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ #include #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "params/BaseReadEngine.hh" +#include "params/BaseMemEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" namespace gem5 { -class BaseReadEngine : public ClockedObject +class BaseMemEngine : public ClockedObject { private: class MemPort : public RequestPort { private: - BaseReadEngine* owner; + BaseMemEngine* owner; bool _blocked; PacketPtr blockedPacket; public: - MemPort(const std::string& name, BaseReadEngine* owner): + MemPort(const std::string& name, BaseMemEngine* owner): RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} @@ -96,10 +96,10 @@ class BaseReadEngine : public ClockedObject PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); public: - PARAMS(BaseReadEngine); + PARAMS(BaseMemEngine); - BaseReadEngine(const BaseReadEngineParams ¶ms); - ~BaseReadEngine(); + BaseMemEngine(const BaseMemEngineParams ¶ms); + ~BaseMemEngine(); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -116,4 +116,4 @@ class BaseReadEngine : public ClockedObject } -#endif // __ACCL_GRAPH_BASE_BASE_APPLY_ENGINE_HH__ +#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 28a503528f..409245eeaa 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -26,8 +26,8 @@ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
*/ -#ifndef __ACCL_GRAPH_BASE_UTIL_HH__ -#define __ACCL_GRAPH_BASE_UTIL_HH__ +#ifndef __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ +#define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" @@ -83,4 +83,4 @@ struct __attribute__ ((packed)) Edge } -#endif // __ACCL_GRAPH_BASE_UTIL_HH__ +#endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index faa5295ed7..086f284950 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class CoalesceEngine(BaseReadEngine): +class CoalesceEngine(BaseMemEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 645bc5f4ea..d3276799aa 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseReadEngine import BaseReadEngine +from m5.objects.BaseMemEngine import BaseMemEngine -class PushEngine(BaseReadEngine): +class PushEngine(BaseMemEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d152e375d..1c5dee8b8f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -36,7 +36,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -83,12 +83,8 
@@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Use a bitset instead of unsigned int for takenMask - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.readHits++; stats.numVertexReads++; @@ -144,11 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -271,11 +263,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexReads++; servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " @@ -342,11 +330,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].items[wl_offset] = wl; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -413,7 +397,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Reducing between tempProp and prop for each item in the cache line. 
- for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; cacheBlocks[block_index].items[i].prop = std::min( cacheBlocks[block_index].items[i].prop, @@ -471,11 +455,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() } cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -500,11 +480,8 @@ CoalesceEngine::processNextApplyAndCommitEvent() } // Since allocated is false, does not matter what the address is. - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); + cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -535,11 +512,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; @@ -548,11 +521,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
Just " "deallocating the line.\n", __func__, block_index); - DPRINTF(MPU, "%s: takenMask[%d] before: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].takenMask = 0; - DPRINTF(MPU, "%s: takenMask[%d] after: %u.\n", __func__, block_index, - cacheBlocks[block_index].takenMask); cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0ddbdfdeb1..4c4cb4567b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -42,7 +42,7 @@ namespace gem5 class WLEngine; -class CoalesceEngine : public BaseReadEngine +class CoalesceEngine : public BaseMemEngine { private: struct Block diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8dcbac0dcc..53cb428b12 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -35,7 +35,7 @@ namespace gem5 { PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseReadEngine(params), + BaseMemEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -49,7 +49,7 @@ PushEngine::getPort(const std::string &if_name, PortID idx) if (if_name == "req_port") { return reqPort; } else if (if_name == "mem_port") { - return BaseReadEngine::getPort(if_name, idx); + return BaseMemEngine::getPort(if_name, idx); } else { return SimObject::getPort(if_name, idx); } diff --git a/src/accl/graph/sega/push_engine.hh 
b/src/accl/graph/sega/push_engine.hh index 29d18709ee..5e8b079d88 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,14 +29,14 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_read_engine.hh" +#include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "params/PushEngine.hh" namespace gem5 { -class PushEngine : public BaseReadEngine +class PushEngine : public BaseMemEngine { private: class PushPacketInfoGen { From 293cb52c7cd6175ee9f5e8e279a363b781ca0b15 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 10:30:08 -0700 Subject: [PATCH 090/287] Adding a new SConscript for src/accl. --- configs/accl/sega.py | 4 ++-- src/accl/graph/SConscript | 30 ++++++++++++++++++++++++++++++ src/accl/graph/sega/SConscript | 2 +- 3 files changed, 33 insertions(+), 3 deletions(-) create mode 100644 src/accl/graph/SConscript diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7d8b96490d..4168217f4d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="facebook/graph_binaries/vertices", + vertex_binary="graphs/facebook/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="facebook/graph_binaries/edgelist_0") + edge_binary="graphs/facebook/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript new file mode 100644 index 0000000000..00fa2466dd --- /dev/null +++ b/src/accl/graph/SConscript @@ -0,0 +1,30 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2016 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +Import('*') + +DebugFlag('MPU') \ No newline at end of file diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 9b4629838b..6e563b2677 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('MPU') +DebugFlag('WLWrites') From 5df2ae29e0faaa80cda5721ad137cdc84b6235e8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 13 Apr 2022 14:11:32 -0700 Subject: [PATCH 091/287] Fixing stats and adding a few new ones. --- configs/accl/sega.py | 4 +-- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/coalesce_engine.cc | 43 ++++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 4 +-- 4 files changed, 35 insertions(+), 18 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 4168217f4d..0532aa2153 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/facebook/graph_binaries/vertices", + vertex_binary="graphs/epinions/graph_binaries/vertices", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/facebook/graph_binaries/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 6e563b2677..19d702c49a 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -35,4 +35,4 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('WLWrites') +DebugFlag('ApplyUpdates') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c5dee8b8f..36a7ddb6d2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -83,16 +84,14 @@ CoalesceEngine::recvReadAddr(Addr addr) "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); - cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.readHits++; - stats.numVertexReads++; assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } + stats.numVertexReads++; return true; } else { // miss @@ -105,6 +104,7 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); + stats.readRejections++; return false; } else { DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); @@ -117,12 +117,15 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d]", __func__, addr, block_index); + stats.readMisses++; + stats.numVertexReads++; return true; } else { assert(!cacheBlocks[block_index].valid); @@ -137,6 +140,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (memReqQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); + stats.readRejections++; return false; } cacheBlocks[block_index].addr = aligned_addr; @@ -158,7 +162,8 @@ CoalesceEngine::recvReadAddr(Addr addr) enqueueMemReq(pkt); DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); - stats.numVertexBlockReads++; + stats.readMisses++; + stats.numVertexReads++; return true; } } @@ -169,6 +174,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); + stats.readRejections++; return false; } if ((!cacheBlocks[block_index].hasConflict) && @@ -178,9 +184,17 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; } + + if (aligned_addr != cacheBlocks[block_index].addr) { + stats.readMisses++; + } else { + stats.readHits++; + } + MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; return true; } } @@ -264,7 +278,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); cacheBlocks[block_index].takenMask |= (1 << wl_offset); - stats.numVertexReads++; + servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); @@ -334,7 +348,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); - // TODO: Make this more general and programmable. // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add // to evictQueue. 
@@ -440,7 +453,6 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; enqueueMemReq(read_pkt); DPRINTF(MPU, "%s: Added the evicting write back packet along with " "its subsequent read packet (to service the conflicts)" @@ -448,6 +460,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -467,12 +482,14 @@ CoalesceEngine::processNextApplyAndCommitEvent() "enough space in outstandingMemReqQueue for the write back" " packet.\n", __func__, block_index); enqueueMemReq(write_pkt); - stats.numVertexBlockWrites++; DPRINTF(MPU, "%s: Added the write back packet to " "outstandingMemReqQueue.\n", __func__); for (int i = 0; i < numElementsPerLine; i++) { if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", __func__, block_index, i); @@ -548,16 +565,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - ADD_STAT(numVertexBlockReads, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), - ADD_STAT(numVertexBlockWrites, statistics::units::Count::get(), - "Number of memory blocks writes for vertecies"), ADD_STAT(numVertexReads, 
statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), "Number of memory vertecies written to cache."), ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits.") + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readRejections, statistics::units::Count::get(), + "Number of cache rejections.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4c4cb4567b..efd19d3e9b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,11 +102,11 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine &coalesce; - statistics::Scalar numVertexBlockReads; - statistics::Scalar numVertexBlockWrites; statistics::Scalar numVertexReads; statistics::Scalar numVertexWrites; statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readRejections; }; CoalesceStats stats; From 4e169aa65eb3e7e1302c66c4031695515d613fff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 15 Apr 2022 15:21:34 -0700 Subject: [PATCH 092/287] Fixing memory atom size issue. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0532aa2153..61df2cc2ef 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ def __init__(self, base_edge_addr): attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=64) + attached_memory_atom_size=32) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=16, on_the_fly_update_map_size=8) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36a7ddb6d2..e54447fd09 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -251,7 +251,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR - for (int i = 0; i < 4; i++) { + for (int i = 0; i < numElementsPerLine; i++) { cacheBlocks[block_index].items[i] = *((WorkListItem*) ( data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 53cb428b12..195cb65dbc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -185,6 +185,8 @@ PushEngine::respondToAlarm() bool PushEngine::handleMemResp(PacketPtr pkt) { + // TODO: in case we need to edit edges, get rid of second statement. + assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { From 7f52d64d0433af8ec9727ef6e6d18c297e039f8e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 17 Apr 2022 13:34:12 -0700 Subject: [PATCH 093/287] Removing dead code. 
--- configs/accl/sega.py | 4 ++-- src/accl/graph/sega/push_engine.cc | 5 ----- 2 files changed, 2 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 61df2cc2ef..450f158f93 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,9 +64,9 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices", + vertex_binary="graphs/test-graph/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") + edge_binary="graphs/test-graph/graph_binaries/edgelist_0") self.mpu.setReqPort(self.mpu.getRespPort()) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 195cb65dbc..716daf92e8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -58,11 +58,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) void PushEngine::startup() { - uint8_t* first_update_data = new uint8_t [4]; - uint32_t* tempPtr = (uint32_t*) first_update_data; - *tempPtr = 0; - - // PacketPtr first_update = createUpdatePacket(0, 4, first_update_data); PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); if (!reqPort.blocked()) { From 2ca8a986a07d819484f5bc40d18101481d6cdf40 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 19 Apr 2022 12:03:25 -0700 Subject: [PATCH 094/287] [WIP] added the central control unit. 
It has error about the crossbar --- configs/accl/sega.py | 10 +- src/accl/graph/sega/CenteralController.py | 39 +++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/centeral_controller.cc | 123 +++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 84 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 10 -- src/accl/graph/sega/push_engine.hh | 2 - src/accl/graph/sega/wl_engine.cc | 6 + src/accl/graph/sega/wl_engine.hh | 2 + 9 files changed, 263 insertions(+), 15 deletions(-) create mode 100644 src/accl/graph/sega/CenteralController.py create mode 100644 src/accl/graph/sega/centeral_controller.cc create mode 100644 src/accl/graph/sega/centeral_controller.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 450f158f93..c4288c92d3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,14 +61,18 @@ def __init__(self): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.ctrl = CenteralController(addr=0, value=0) self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test-graph/graph_binaries/vertices_0", + vertex_binary="graphs/test/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test-graph/graph_binaries/edgelist_0") + edge_binary="graphs/test/edgelist_0") + self.interconnect = SystemXBar() - self.mpu.setReqPort(self.mpu.getRespPort()) + self.ctrl.req_port = self.interconnect.cpu_side_ports + self.mpu.setReqPort(self.interconnect.cpu_side_ports) + self.mpu.setRespPort(self.interconnect.mem_side_ports) self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py new file mode 100644 index 0000000000..7b00f8b12d --- /dev/null +++ b/src/accl/graph/sega/CenteralController.py @@ -0,0 +1,39 @@ +# -*- coding: utf-8 
-*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class CenteralController(ClockedObject): + type = 'CenteralController' + cxx_header = "accl/graph/sega/centeral_controller.hh" + cxx_class = 'gem5::CenteralController' + + req_port = RequestPort("Port to send updates to the outside") + addr = Param.Addr("") + value = Param.Int(0, "") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 19d702c49a..c8810bbdb2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,10 +27,12 @@ Import('*') +SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc new file mode 100644 index 0000000000..daa2d9b390 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.cc @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/centeral_controller.hh" + +#include "mem/packet_access.hh" + +namespace gem5 +{ + +CenteralController::CenteralController + (const CenteralControllerParams ¶ms): + ClockedObject(params), + reqPort(name() + ".req_port", this), + addr(params.addr), + value(params.value) +{} + +Port& +CenteralController::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "req_port") { + return reqPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +CenteralController::startup() +{ + PacketPtr first_update = + createUpdatePacket(addr, value); + + if (!reqPort.blocked()) { + reqPort.sendPacket(first_update); + } +} + +template PacketPtr +CenteralController::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared( + addr, sizeof(T), addr, value); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) value) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +// AddrRangeList +// CenteralController::ReqPort::getAddrRanges() const 
+// { +// AddrRangeList ret; +// ret.clear(); +// return ret; +// } + +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); + + _blocked = false; + sendPacket(blockedPacket); + + if (!_blocked) { + blockedPacket = nullptr; + } +} + +} diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh new file mode 100644 index 0000000000..0e1bb6ac80 --- /dev/null +++ b/src/accl/graph/sega/centeral_controller.hh @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "params/CenteralController.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class CenteralController : public ClockedObject +{ + private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, CenteralController* owner) : + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + ReqPort reqPort; + + Addr addr; + uint32_t value; + + template PacketPtr + createUpdatePacket(Addr addr, T value); + + virtual void startup(); + + public: + PARAMS(CenteralController); + CenteralController(const CenteralControllerParams ¶ms); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; +}; + +} + 
+#endif // __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 716daf92e8..ddfc2edef8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -55,16 +55,6 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } -void -PushEngine::startup() -{ - PacketPtr first_update = createUpdatePacket(0, (uint32_t) 0); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); - } -} - void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5e8b079d88..ce9045e91a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -112,8 +112,6 @@ class PushEngine : public BaseMemEngine // always be limited by the b/w of the memory. std::deque memRespQueue; - virtual void startup(); - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextAddrGenEvent; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ad9e93ba60..40fca42d26 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -58,6 +58,12 @@ WLEngine::getPort(const std::string &if_name, PortID idx) } } +void +WLEngine::init() +{ + respPort.sendRangeChange(); +} + AddrRangeList WLEngine::RespPort::getAddrRanges() const { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 12df93ee79..2698ce3ea8 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -65,6 +65,8 @@ class WLEngine : public BaseReduceEngine virtual void recvRespRetry(); }; + virtual void init(); + RespPort respPort; bool blockedByCoalescer; From a95da7b0dc83e976b444f5304e818ffe96adf90e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 22 Apr 2022 11:44:24 -0700 Subject: [PATCH 095/287] Adding UpdateWL as a MemCmd and fixing code. 
--- configs/accl/sega.py | 5 +- src/accl/graph/TODO.md | 5 + src/accl/graph/base/data_structs.hh | 3 + src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/coalesce_engine.cc | 195 +++++++++------------ src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 31 +--- src/mem/packet.cc | 40 +---- src/mem/packet.hh | 4 +- 9 files changed, 105 insertions(+), 194 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c4288c92d3..aa3675d847 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -65,14 +65,15 @@ def __init__(self): self.mpu = MPU(base_edge_addr=0x80000000) self.mem_ctrl = MPUMemory( vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/test/vertices_0", + vertex_binary="graphs/epinions/graph_binaries/vertices_0", edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/test/edgelist_0") + edge_binary="graphs/epinions/graph_binaries/edgelist_0") self.interconnect = SystemXBar() self.ctrl.req_port = self.interconnect.cpu_side_ports self.mpu.setReqPort(self.interconnect.cpu_side_ports) self.mpu.setRespPort(self.interconnect.mem_side_ports) + self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 1cec4dc6f9..f5690a3faa 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,3 +1,8 @@ # TODO Items + * implement all the communications between simobjects as req/retry. * get rid of maps with RequestPtr as keys +* add UpdateWL as a MemCmd +* Replace std::floor with roundDown from intmath.hh in src +* We might need to revisit the fact that we could insert something to a queue on + the same cycle that another event is consuming something from the queue. 
diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 409245eeaa..7535d4bbac 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -30,6 +30,7 @@ #define __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ #include "base/cprintf.hh" +#include "base/intmath.hh" namespace gem5 { @@ -81,6 +82,8 @@ struct __attribute__ ((packed)) Edge {} }; +static_assert(isPowerOf2(sizeof(WorkListItem))); + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index daa2d9b390..41ebeb9cd6 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -54,8 +54,7 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::startup() { - PacketPtr first_update = - createUpdatePacket(addr, value); + PacketPtr first_update = createUpdatePacket(addr, value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -71,8 +70,7 @@ CenteralController::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) value) << 2); - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); @@ -81,14 +79,6 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } -// AddrRangeList -// CenteralController::ReqPort::getAddrRanges() const -// { -// AddrRangeList ret; -// ret.clear(); -// return ret; -// } - void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e54447fd09..e6503ea01d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/wl_engine.hh" +#include "base/intmath.hh" #include 
"debug/ApplyUpdates.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -47,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), stats(*this) { + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); cacheBlocks = new Block [numLines]; for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); @@ -72,18 +74,25 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < numElementsPerLine); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextRespondEvent for latency cycles in + // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, responseQueue.size(), cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); stats.readHits++; @@ -104,6 +113,8 @@ CoalesceEngine::recvReadAddr(Addr addr) // Out of MSHR entries DPRINTF(MPU, "%s: Out of MSHR entries. 
" "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection stats.readRejections++; return false; } else { @@ -200,6 +211,7 @@ CoalesceEngine::recvReadAddr(Addr addr) } } +// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextRespondEvent() { @@ -241,8 +253,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - uint8_t* data = pkt->getPtr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", @@ -250,17 +260,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - cacheBlocks[block_index].items[i] = *((WorkListItem*) ( - data + (i * sizeof(WorkListItem)))); DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; delete pkt; - int bias = 0; + // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; @@ -271,20 +281,26 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); + // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " "responseQueue. 
responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].takenMask |= (1 << wl_offset); + // End of the said block servicedIndices.push_back(i); DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } + // TODO: We Can use taken instead of this + // TODO: Change the MSHRMap from map to map + int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { Addr print_addr = MSHRMap[block_index][i - bias]; MSHRMap[block_index].erase(MSHRMap[block_index].begin() + @@ -298,8 +314,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { - // TODO: I think this is unnecessary. - cacheBlocks[block_index].hasConflict = true; + assert(cacheBlocks[block_index].hasConflict); } if ((!nextRespondEvent.scheduled()) && @@ -341,11 +356,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { cacheBlocks[block_index].hasChange = true; + stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); - stats.numVertexWrites++; DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. 
@@ -380,8 +395,9 @@ CoalesceEngine::processNextApplyAndCommitEvent() __func__, block_index); DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " "then commited.\n", __func__, block_index); + if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange)&& + if ((cacheBlocks[block_index].hasChange) && (cacheBlocks[block_index].hasConflict) && (memReqQueueHasSpace(2))) { DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", @@ -420,6 +436,7 @@ CoalesceEngine::processNextApplyAndCommitEvent() cacheBlocks[block_index].items[i].to_string()); if (old_prop != cacheBlocks[block_index].items[i].prop) { changedMask |= (1 << i); + // TODO: Add a stat to count the number of changed props. DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", __func__, block_index, i); } @@ -434,117 +451,65 @@ CoalesceEngine::processNextApplyAndCommitEvent() (uint8_t*) cacheBlocks[block_index].items); DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), peerMemoryAtomSize); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
- " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - - enqueueMemReq(write_pkt); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } + enqueueMemReq(write_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet to " + "outstandingMemReqQueue.\n" , __func__); + + for (int i = 0; i < numElementsPerLine; i++) { + if ((changedMask & (1 << i)) == (1 << i)) { + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", + __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); + DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", + __func__, block_index, i); } - - // Since allocated is false, does not matter what the address is. - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - DPRINTF(MPU, "%s: Popped an item from evictQueue. evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); } - } else { - DPRINTF(MPU, "%s: No item from cache line[%d] has changed. No write " - "backs are necessary.\n", __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for the write " - "back packet and its subsequent read packet.\n", - __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); + } - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for a read " + "packet.\n", __func__, block_index); + Addr miss_addr = MSHRMap[block_index][0]; + DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" + " Addr: %lu.\n", __func__, block_index, miss_addr); + + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket( - aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. Just " - "deallocating the line.\n", __func__, block_index); - - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); + enqueueMemReq(read_pkt); + DPRINTF(MPU, "%s: Added the evicting write back packet along with " + "its subsequent read packet (to service the conflicts)" + " to outstandingMemReqQueue.\n" , __func__); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + } else { + DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " + "enough space in outstandingMemReqQueue for the write back" + " packet.\n", __func__, block_index); + DPRINTF(MPU, "%s: Added the write back packet to " + "outstandingMemReqQueue.\n", __func__); + + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; } + } else { DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " "for eviction. 
Therefore, ignoring the evict schedule.\n", diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ddfc2edef8..e822b7168b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -238,7 +238,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) req->setPC(((Addr) _requestorId) << 2); // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); // pkt->setData(data); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 40fca42d26..148f5de5be 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,8 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +// TODO: Parameterize the number of pops WLEngine can do at a time. +// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void WLEngine::processNextReadEvent() { @@ -144,9 +146,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } } } else { @@ -164,9 +164,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" ". 
updateQueue.size = %u.\n", __func__, updateQueue.size()); - if (updateQueue.size() == updateQueueSize - 1) { - respPort.checkRetryReq(); - } + respPort.checkRetryReq(); } // TODO: Only schedule nextReadEvent only when it has to be scheduled @@ -194,12 +192,9 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - std::unordered_map::iterator it = - addrWorkListMap.begin(); - - std::vector servicedAddresses; - while (it != addrWorkListMap.end()) { - Addr addr = it->first; + for (auto &it : addrWorkListMap) { + Addr addr = it.first; + assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " @@ -214,17 +209,9 @@ WLEngine::processNextReduceEvent() stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - servicedAddresses.push_back(addr); - DPRINTF(MPU, "%s: Added addr: %lu to servicedAdresses.\n", - __func__, addr); - it++; - } - - addrWorkListMap.clear(); - for (int i = 0; i < servicedAddresses.size(); i++) { - onTheFlyUpdateMap.erase(servicedAddresses[i]); + onTheFlyUpdateMap.erase(addr); DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, servicedAddresses[i]); + __func__, addr); } } diff --git a/src/mem/packet.cc b/src/mem/packet.cc index da45246e49..daf9d18e88 100644 --- a/src/mem/packet.cc +++ b/src/mem/packet.cc @@ -237,6 +237,7 @@ MemCmd::commandInfo[] = { {IsRead, IsResponse}, InvalidCmd, "HTMReqResp" }, { {IsRead, IsRequest}, InvalidCmd, "HTMAbort" }, { {IsRequest}, InvalidCmd, "TlbiExtSync" }, + { {IsRequest, HasData}, InvalidCmd, "UpdateWL"} }; AddrRange @@ -532,43 +533,4 @@ Packet::getHtmTransactionUid() const return htmTransactionUid; } -std::string -Packet::printData() -{ - char ret[1024]; - if (isWrite()) { - uint8_t* data = getPtr(); - std::sprintf(ret,"\n" - "V[%lu] temp_prop: %u, prop: %u, " 
- "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n" - "V[%lu] temp_prop: %u, prop: %u, " - "degree: %u, edgeIndex: %u.\n", - getAddr(), - *((uint32_t*) data), - *((uint32_t*) (data + 4)), - *((uint32_t*) (data + 8)), - *((uint32_t*) (data + 12)), - getAddr() + 16, - *((uint32_t*) (data + 16)), - *((uint32_t*) (data + 20)), - *((uint32_t*) (data + 24)), - *((uint32_t*) (data + 28)), - getAddr() + 32, - *((uint32_t*) (data + 32)), - *((uint32_t*) (data + 36)), - *((uint32_t*) (data + 40)), - *((uint32_t*) (data + 44)), - getAddr() + 48, - *((uint32_t*) (data + 48)), - *((uint32_t*) (data + 52)), - *((uint32_t*) (data + 56)), - *((uint32_t*) (data + 60))); - } - return ret; -} - } // namespace gem5 diff --git a/src/mem/packet.hh b/src/mem/packet.hh index 8803eacced..5332ee32a2 100644 --- a/src/mem/packet.hh +++ b/src/mem/packet.hh @@ -149,7 +149,7 @@ class MemCmd // Tlb shootdown TlbiExtSync, // MPU Accelerator - // UpdateWL, + UpdateWL, NUM_MEM_CMDS }; @@ -1374,8 +1374,6 @@ class Packet : public Printable template void setRaw(T v); - std::string printData(); - public: /** * Check a functional request against a memory value stored in From e4b665c796dbe348a511585c3eb2c1b3d87630b4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Apr 2022 20:28:25 -0700 Subject: [PATCH 096/287] A little bit of debugging and updating config script. 
--- configs/accl/sega.py | 138 +++++++++++++++++++++++-------- src/accl/graph/TODO.md | 5 +- src/accl/graph/sega/wl_engine.cc | 1 + src/accl/graph/sega/wl_engine.hh | 2 +- 4 files changed, 105 insertions(+), 41 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index aa3675d847..9dd8c0f358 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,5 +1,9 @@ import m5 +import argparse + +from math import log from m5.objects import * +from m5.util.convert import toMemorySize class MPU(SubSystem): def __init__(self, base_edge_addr): @@ -35,53 +39,115 @@ def setEdgeMemPort(self, port): self.push_engine.mem_port = port class MPUMemory(SubSystem): - def __init__(self, vertex_range, vertex_binary, edge_range, edge_binary): + def __init__(self, + num_channels: int, + cache_line_size: int, + vertex_memory_size: str, + edge_memory_size: str, + graph_path: str): super(MPUMemory, self).__init__() - self.vertex_mem_ctrl = SimpleMemory( - range=vertex_range, bandwidth="19.2GB/s", - latency="30ns", image_file=vertex_binary) - self.edge_mem_ctrl = SimpleMemory( - range=edge_range, bandwidth="19.2GB/s", - latency="30ns", image_file=edge_binary) - - def getVertexPort(self): - return self.vertex_mem_ctrl.port - def setVertexPort(self, port): - self.vertex_mem_ctrl.port = port - - def getEdgePort(self): - return self.edge_mem_ctrl.port - def setEdgePort(self, port): - self.edge_mem_ctrl.port = port + + self._vertex_ranges = self._interleave_addresses( + AddrRange(start=0, size=vertex_memory_size),\ + num_channels,\ + cache_line_size) + + self._edge_chunk_size = int(\ + toMemorySize(edge_memory_size)/num_channels) + self._edge_ranges = [AddrRange(\ + start=toMemorySize(vertex_memory_size)+\ + self._edge_chunk_size*i,\ + size=self._edge_chunk_size)\ + for i in range(num_channels)] + + vertex_mem_ctrl = [] + edge_mem_ctrl = [] + for i in range(num_channels): + vertex_mem_ctrl.append( + SimpleMemory(range=self._vertex_ranges[i], + bandwidth="19.2GB/s", + 
latency="30ns", + image_file=f"{graph_path}/vertices_{i}") + ) + edge_mem_ctrl.append( + SimpleMemory(range=self._edge_ranges[i], + bandwidth="19.2GB/s", + latency="30ns", + image_file=f"{graph_path}/edgelist_{i}") + ) + self.vertex_mem_ctrl = vertex_mem_ctrl + self.edge_mem_ctrl = edge_mem_ctrl + + def _interleave_addresses(self, + plain_range, + num_channels, + cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + + def getVertexPort(self, i): + return self.vertex_mem_ctrl[i].port + def setVertexPort(self, port, i): + self.vertex_mem_ctrl[i].port = port + + def getEdgeBaseAddr(self, i): + return self._edge_ranges[i].start + def getEdgePort(self, i): + return self.edge_mem_ctrl[i].port + def setEdgePort(self, port, i): + self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self): + def __init__(self, num_mpus, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.ctrl = CenteralController(addr=0, value=0) - self.mpu = MPU(base_edge_addr=0x80000000) - self.mem_ctrl = MPUMemory( - vertex_range=AddrRange(start=0x000000, size="2GiB"), - vertex_binary="graphs/epinions/graph_binaries/vertices_0", - edge_range=AddrRange(start=0x80000000, size="2GiB"), - edge_binary="graphs/epinions/graph_binaries/edgelist_0") - self.interconnect = SystemXBar() + self.interconnect = NoncoherentXBar(frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64) + self.ctrl = CenteralController(addr=0, value=0) self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mpu.setReqPort(self.interconnect.cpu_side_ports) - self.mpu.setRespPort(self.interconnect.mem_side_ports) - 
self.mpu.setVertexMemPort(self.mem_ctrl.getVertexPort()) - self.mpu.setEdgeMemPort(self.mem_ctrl.getEdgePort()) + self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + + mpus = [] + for i in range(num_mpus): + mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus[i].setReqPort(self.interconnect.cpu_side_ports) + mpus[i].setRespPort(self.interconnect.mem_side_ports) + mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) + mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) + self.mpu = mpus + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_mpus", type=int) + argparser.add_argument("graph_path", type=str) + args = argparser.parse_args() + return args.num_mpus, args.graph_path -system = SEGA() -root = Root(full_system = False, system = system) +if __name__ == "__m5_main__": + num_mpus, graph_path = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") + system = SEGA(num_mpus, graph_path) + root = Root(full_system = False, system = system) -m5.instantiate() + m5.instantiate() -exit_event = m5.simulate() -print("Simulation finished!") -exit() + exit_event = m5.simulate() + print("Simulation finished!") + exit() diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index f5690a3faa..29b5a2939e 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,8 +1,5 @@ # TODO Items -* implement all the communications between simobjects as req/retry. -* get rid of maps with RequestPtr as keys -* add UpdateWL as a MemCmd * Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. + the same cycle that another event is consuming something from the queue. 
\ No newline at end of file diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 148f5de5be..e949cbcf5b 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -213,6 +213,7 @@ WLEngine::processNextReduceEvent() DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", __func__, addr); } + addrWorkListMap.clear(); } bool diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2698ce3ea8..597fdb2b1e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -52,7 +52,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner) + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) {} virtual AddrRangeList getAddrRanges() const; From c8b7b26fcc071883bb70cbaf31b936249a4b20be Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Apr 2022 16:56:04 -0700 Subject: [PATCH 097/287] Adding initState to CenteralController. 
--- configs/accl/sega.py | 23 ++++++++++------ src/accl/graph/sega/CenteralController.py | 3 ++ src/accl/graph/sega/centeral_controller.cc | 32 ++++++++++++++++++++++ src/accl/graph/sega/centeral_controller.hh | 6 +++- 4 files changed, 55 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9dd8c0f358..0907ba77de 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -66,8 +66,7 @@ def __init__(self, vertex_mem_ctrl.append( SimpleMemory(range=self._vertex_ranges[i], bandwidth="19.2GB/s", - latency="30ns", - image_file=f"{graph_path}/vertices_{i}") + latency="30ns") ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], @@ -108,21 +107,28 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, graph_path): + def __init__(self, num_mpus, vertex_cache_line_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = vertex_cache_line_size self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0) + self.ctrl = CenteralController(addr=0, value=0, + image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory(num_mpus, 32, "2GiB", "2GiB", graph_path) + self.mem_ctrl = MPUMemory( + num_mpus, + self.cache_line_size, + "2GiB", + "2GiB", + graph_path) mpus = [] for i in range(num_mpus): @@ -136,14 +142,15 @@ def __init__(self, num_mpus, graph_path): def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) + argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) args = argparser.parse_args() - return args.num_mpus, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, 
args.graph_path if __name__ == "__m5_main__": - num_mpus, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, graph_path = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, graph_path) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7b00f8b12d..bd2f6320a8 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -34,6 +34,9 @@ class CenteralController(ClockedObject): cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' + system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") addr = Param.Addr("") value = Param.Int(0, "") + + image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 41ebeb9cd6..3c05972224 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include "base/loader/memory_image.hh" +#include "base/loader/object_file.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -36,6 +39,7 @@ namespace gem5 CenteralController::CenteralController (const CenteralControllerParams ¶ms): ClockedObject(params), + system(params.system), reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) @@ -51,6 +55,26 @@ CenteralController::getPort(const std::string &if_name, PortID idx) } } +void +CenteralController::initState() +{ + ClockedObject::initState(); + + const auto &file = params().image_file; + if (file == "") + return; + + auto *object = loader::createObjectFile(file, true); + 
fatal_if(!object, "%s: Could not load %s.", name(), file); + + loader::debugSymbolTable.insert(*object->symtab().globals()); + loader::MemoryImage image = object->buildImage(); + PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, + system->cacheLineSize()); + + panic_if(!image.write(proxy), "%s: Unable to write image.", name()); +} + void CenteralController::startup() { @@ -110,4 +134,12 @@ CenteralController::ReqPort::recvReqRetry() } } +void +CenteralController::functionalAccess(PacketPtr pkt) +{ + DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + reqPort.sendFunctional(pkt); +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 0e1bb6ac80..102800de92 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -61,16 +61,20 @@ class CenteralController : public ClockedObject virtual void recvReqRetry(); }; + System* system; ReqPort reqPort; Addr addr; uint32_t value; - template PacketPtr + template PacketPtr createUpdatePacket(Addr addr, T value); + virtual void initState(); virtual void startup(); + void functionalAccess(PacketPtr pkt); + public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); From f0bf6143f964c3ddbd5197d1d77efee8fe0381e8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Apr 2022 15:28:06 -0700 Subject: [PATCH 098/287] Changing debug flag for CenteralController. 
--- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/centeral_controller.cc | 5 +++-- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index c8810bbdb2..16fab86ede 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -38,3 +38,4 @@ Source('push_engine.cc') Source('wl_engine.cc') DebugFlag('ApplyUpdates') +DebugFlag('CenteralController') diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 3c05972224..f19c93ebac 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -30,7 +30,7 @@ #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" -#include "debug/MPU.hh" +#include "debug/CenteralController.hh" #include "mem/packet_access.hh" namespace gem5 @@ -137,7 +137,8 @@ CenteralController::ReqPort::recvReqRetry() void CenteralController::functionalAccess(PacketPtr pkt) { - DPRINTF(MPU, "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", + DPRINTF(CenteralController, + "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); reqPort.sendFunctional(pkt); } From 4485e3b2b981fc620daabd7470d8bc8d9adcf978 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 3 May 2022 09:33:52 -0700 Subject: [PATCH 099/287] Fixing a bug and adding new stats. 
--- configs/accl/sega.py | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 4 +++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 19 ++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 13 +++++++++++++ 5 files changed, 41 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0907ba77de..bfdad58f72 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,13 +9,15 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=0, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, - attached_memory_atom_size=32) + attached_memory_atom_size=32, + cache_size="1MiB", + num_mshr_entry=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, + update_queue_size=32, on_the_fly_update_map_size=8) def getRespPort(self): @@ -113,6 +115,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = vertex_cache_line_size + self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e6503ea01d..fbe593507a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -199,7 +199,7 @@ CoalesceEngine::recvReadAddr(Addr addr) if (aligned_addr != cacheBlocks[block_index].addr) { stats.readMisses++; } else { - stats.readHits++; + stats.readHitUnderMisses++; } MSHRMap[block_index].push_back(addr); @@ -538,6 +538,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hits."), ADD_STAT(readMisses, statistics::units::Count::get(), "Number of cache misses."), 
+ ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections.") { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index efd19d3e9b..ce019ef969 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar numVertexWrites; statistics::Scalar readHits; statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e822b7168b..69b9f3f23e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,8 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()) + nextPushEvent([this] { processNextPushEvent(); }, name()), + stats(*this) {} Port& @@ -207,6 +208,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); + stats.numUpdates++; DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); @@ -247,4 +249,19 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +PushEngine::PushStats::PushStats(PushEngine &_push) + : statistics::Group(&_push), + push(_push), + + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of sent updates.") +{ +} + +void +PushEngine::PushStats::regStats() +{ + using namespace statistics; +} + } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 
ce9045e91a..7a6981daa0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -120,6 +120,19 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + struct PushStats : public statistics::Group + { + PushStats(PushEngine &push); + + void regStats() override; + + PushEngine &push; + + statistics::Scalar numUpdates; + }; + + PushStats stats; + protected: virtual void respondToAlarm(); virtual bool handleMemResp(PacketPtr pkt); From c17fb8b04a02fdd590aa3ea5df55cedef47b1f18 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 17 May 2022 10:56:09 -0700 Subject: [PATCH 100/287] Fixing double evicts. --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/coalesce_engine.cc | 27 ++++++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 3 --- 3 files changed, 13 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index bfdad58f72..b799b05dc5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=192, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -130,7 +130,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): num_mpus, self.cache_line_size, "2GiB", - "2GiB", + "14GiB", graph_path) mpus = [] @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate() + exit_event = m5.simulate(1000000000000) print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fbe593507a..b41f6b1db7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -325,22 +325,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } -PacketPtr 
-CoalesceEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { @@ -370,7 +354,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - evictQueue.push_back(block_index); + // TODO: Fix this hack + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + } + } + if (!found) { + evictQueue.push_back(block_index); + } DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce019ef969..e86014fc25 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -85,9 +85,6 @@ class CoalesceEngine : public BaseMemEngine std::deque evictQueue; - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - // PacketPtr createWritePacket(Addr addr, unsigned int size, WorkListItem wl); - EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); From 4c8ebec475ae4473c8819f59cc3c09804613d7bc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 18 May 2022 17:23:05 -0700 Subject: [PATCH 101/287] Fixing false dependency and deadlock issues. wip. 
--- src/accl/graph/sega/coalesce_engine.cc | 74 +++++++++++++++++++++++++- 1 file changed, 72 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b41f6b1db7..92d82bce35 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -347,9 +347,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); + // TODO: Make this more general and programmable. - // TODO: Later on check (cacheBlocks[block_index].hasConflict) to add - // to evictQueue. if ((cacheBlocks[block_index].takenMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", @@ -359,6 +358,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) for (auto i : evictQueue) { if (i == block_index) { found = true; + break; } } if (!found) { @@ -376,6 +376,76 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } +void +CoalesceEngine::processNextApplyEvent() +{ + int block_index = applyQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseApplySchedules++; + } else if (!cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + "needed. 
Adding the cache line to evict schedule.\n", + __func__, block_index); + evictQueue.push_back(block_index); + } else { + for (int i = 0; i < numElementsPerLine; i++) { + uint32_t old_prop = cacheBlocks[block_index].items[i].prop; + cacheBlocks[block_index].items[i].prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + // TODO: Is this correct? + cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; + + if (cacheBlocks[block_index].items[i].prop != old_prop) { + if (peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i])) { + DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", + __func__, + cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + } else { + // peerPushEngine->setPushAlarm(); + // pendingPushAlarm = true; + return; + } + } + } + // TODO: This is where eviction policy goes + evictQueue.push_back(block_index); + } + + applyQueue.pop_front(); + + if ((!evictQueue.empty()) && + (!pendingAlarm()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); + } + + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextEvictEvent() +{ + int block_index = evictQueue.front(); + + if (cacheBlocks[block_index].takenMask) { + DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + "Therefore, ignoring the apply schedule.\n", + __func__, block_index); + stats.falseEvictSchedules++; + } else { + int space_needed = cacheBlocks + } +} + void CoalesceEngine::processNextApplyAndCommitEvent() { From 7e7f09d79330b2de27c62d3d07e7bf141c20ccd3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 12:10:10 -0700 Subject: [PATCH 102/287] Decoupling apply and evict. Done. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 214 ++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 11 +- 3 files changed, 81 insertions(+), 146 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b799b05dc5..9d8b449e0f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -122,7 +122,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=192, value=0, + self.ctrl = CenteralController(addr=0, value=0, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92d82bce35..f3402255bc 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,7 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyAndCommitEvent([this] { processNextApplyAndCommitEvent(); }, name()), + nextApplyEvent([this] { processNextApplyEvent(); }, name()), + nextEvictEvent([this] { processNextEvictEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -237,8 +238,8 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToAlarm() { - assert(!nextApplyAndCommitEvent.scheduled()); - schedule(nextApplyAndCommitEvent, nextCycle()); + assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + schedule(nextEvictEvent, nextCycle()); } bool @@ -362,16 +363,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (!found) { - evictQueue.push_back(block_index); + applyQueue.push_back(block_index); } DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty()) && - (!pendingAlarm())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } @@ -442,150 +442,74 @@ CoalesceEngine::processNextEvictEvent() __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks - } -} - -void -CoalesceEngine::processNextApplyAndCommitEvent() -{ - // FIXME: Refactor the line below to work with the new inheritance. - // assert((!alarmRequested) && (spaceRequested == 0)); - int block_index = evictQueue.front(); - uint8_t changedMask = 0; - - DPRINTF(MPU, "%s: Received nextApplyAndCommitEvent for cache line[%d].\n", - __func__, block_index); - DPRINTF(MPU, "%s: Checking to see if cache line[%d] could be applied and " - "then commited.\n", __func__, block_index); - - if (cacheBlocks[block_index].takenMask == 0) { - if ((cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(2))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (cacheBlocks[block_index].hasConflict) && - (memReqQueueHasSpace(1))) { - DPRINTF(MPU, "%s: ApplyAndCommit could be done on cache line[%d].\n", - __func__, block_index); - } else if ((!cacheBlocks[block_index].hasChange) && - (!cacheBlocks[block_index].hasConflict)) { - DPRINTF(MPU, "%s: No ApplyAndCommit needed for cache line[%d].\n", - __func__, block_index); - } else { - int spaceNeeded = cacheBlocks[block_index].hasConflict ? 
2 : 1; - requestAlarm(spaceNeeded); - DPRINTF(MPU, "%s: Not enough space in outstandingMemReqQueue. Set " - "an alarm for nextApplyAndCommitEvent when there is %d space.\n", - __func__, spaceNeeded); + int space_needed = cacheBlocks[block_index].hasChange ? + (cacheBlocks[block_index].hasConflict ? 2 : 1) : + (cacheBlocks[block_index].hasConflict ? 1 : 0); + if (!memReqQueueHasSpace(space_needed)) { + DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + "procees the eviction of cache line [%d]. hasChange: %d, " + "hasConflict: %d.\n", __func__, block_index, + cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].hasConflict); + requestAlarm(space_needed); return; - } - - // Reducing between tempProp and prop for each item in the cache line. - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - DPRINTF(MPU, "%s: Applied cache line[%d][%d] = %s.\n", __func__, - block_index, i, - cacheBlocks[block_index].items[i].to_string()); - if (old_prop != cacheBlocks[block_index].items[i].prop) { - changedMask |= (1 << i); - // TODO: Add a stat to count the number of changed props. 
- DPRINTF(MPU, "%s: Change observed in cache line[%d][%d].\n", - __func__, block_index, i); + } else { + if (cacheBlocks[block_index].hasChange) { + DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + "size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } - } - if (cacheBlocks[block_index].hasChange) { - DPRINTF(MPU, "%s: At least one item from cache line[%d] has changed.\n" - , __func__, block_index); - - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, size = %d.\n", - __func__, write_pkt->getAddr(), peerMemoryAtomSize); - enqueueMemReq(write_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet to " - "outstandingMemReqQueue.\n" , __func__); - - for (int i = 0; i < numElementsPerLine; i++) { - if ((changedMask & (1 << i)) == (1 << i)) { - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu] = %s.\n", - __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem(cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent cache line[%d][%d] to PushEngine.\n", - __func__, block_index, i); - } - } - } + if (cacheBlocks[block_index].hasConflict) { + assert(!MSHRMap[block_index].empty()); + Addr miss_addr = MSHRMap[block_index].front(); + DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + " is Addr: %lu.\n", __func__, block_index, miss_addr); - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - DPRINTF(MPU, "%s: A conflict exists for cache line[%d]. 
There is " - "enough space in outstandingMemReqQueue for a read " - "packet.\n", __func__, block_index); - Addr miss_addr = MSHRMap[block_index][0]; - DPRINTF(MPU, "%s: First conflicting address for cache line[%d] is" - " Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * + Addr aligned_miss_addr = + std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, aligned_miss_addr, peerMemoryAtomSize); - enqueueMemReq(read_pkt); - DPRINTF(MPU, "%s: Added the evicting write back packet along with " - "its subsequent read packet (to service the conflicts)" - " to outstandingMemReqQueue.\n" , __func__); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; - } else { - DPRINTF(MPU, "%s: No conflict exists for cache line[%d]. There is " - "enough space in outstandingMemReqQueue for the write back" - " packet.\n", __func__, block_index); - DPRINTF(MPU, "%s: Added the write back packet to " - "outstandingMemReqQueue.\n", __func__); - - // Since allocated is false, does not matter what the address is. - cacheBlocks[block_index].takenMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; - } + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, + peerMemoryAtomSize); + DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ " req addr (aligned_addr) = %lu, size = %d.\n", + __func__, miss_addr, + read_pkt->getAddr(), read_pkt->getSize()); + enqueueMemReq(read_pkt); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + __func__, block_index, aligned_miss_addr); + } else { - } else { - DPRINTF(MPU, "%s: cache line[%d] has been read since being scheduled " - "for eviction. Therefore, ignoring the evict schedule.\n", - __func__, block_index); + // Since allocated is false, does not matter what the address is. + cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].allocated = false; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].hasChange = false; + DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + __func__, block_index); + } + } } evictQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from evictQueue. 
evictQueue.size " - " = %u.\n", __func__, evictQueue.size()); - if ((!nextApplyAndCommitEvent.scheduled()) && - (!evictQueue.empty())) { - schedule(nextApplyAndCommitEvent, nextCycle()); + if ((!evictQueue.empty()) && + (!nextEvictEvent.scheduled())) { + schedule(nextEvictEvent, nextCycle()); } } @@ -604,7 +528,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections.") + "Number of cache rejections."), + ADD_STAT(falseApplySchedules, statistics::units::Count::get(), + "Number of failed apply schedules."), + ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), + "Number of failed evict schedules.") { } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e86014fc25..82b03f53aa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -83,13 +83,18 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + std::deque applyQueue; + std::deque evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); - EventFunctionWrapper nextApplyAndCommitEvent; - void processNextApplyAndCommitEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); + + EventFunctionWrapper nextEvictEvent; + void processNextEvictEvent(); struct CoalesceStats : public statistics::Group { @@ -105,6 +110,8 @@ class CoalesceEngine : public BaseMemEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; + statistics::Scalar falseApplySchedules; + statistics::Scalar falseEvictSchedules; }; CoalesceStats stats; From 550a9fed64190cb41db8366425e3b793c8c5ada8 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 19 May 2022 21:20:07 -0700 Subject: [PATCH 103/287] Fixed miss-deallocation 
bug. Hopefully. --- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 24 +++--- src/accl/graph/base/base_mem_engine.hh | 17 ++-- src/accl/graph/sega/coalesce_engine.cc | 107 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 13 +-- src/accl/graph/sega/push_engine.cc | 26 ++++-- src/accl/graph/sega/push_engine.hh | 11 ++- src/accl/graph/sega/wl_engine.cc | 1 - src/accl/graph/sega/wl_engine.hh | 1 - 9 files changed, 136 insertions(+), 66 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 9d8b449e0f..31b65ae726 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=16, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 50e64ae7c3..f02f1d2feb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -37,8 +37,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), - alarmRequested(false), - spaceRequested(0), + memAlarmRequested(false), + memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) @@ -106,12 +106,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (alarmRequested && + if (memAlarmRequested && (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - spaceRequested))) { - alarmRequested = false; - spaceRequested = 0; - respondToAlarm(); + 
(outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -151,7 +151,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::memReqQueueHasSpace(int space) +BaseMemEngine::allocateMemReqSpace(int space) { assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); return ( @@ -179,13 +179,13 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestAlarm(int space) { - panic_if((alarmRequested == true) || (spaceRequested != 0), +BaseMemEngine::requestMemAlarm(int space) { + panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - alarmRequested = true; - spaceRequested = space; + memAlarmRequested = true; + memSpaceRequested = space; } void diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fb7cab91b0..8a18807e2e 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,8 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; - bool alarmRequested; - int spaceRequested; + bool memAlarmRequested; + int memSpaceRequested; std::deque outstandingMemReqQueue; EventFunctionWrapper nextMemReqEvent; @@ -81,15 +81,16 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - - bool memReqQueueHasSpace(int space); + bool allocateMemReqSpace(int space); bool memReqQueueFull(); + + bool pendingMemAlarm() { return memAlarmRequested; } + void requestMemAlarm(int space); + + void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void 
enqueueMemReq(PacketPtr pkt); - bool pendingAlarm() { return alarmRequested; } - void requestAlarm(int space); - virtual void respondToAlarm() = 0; + virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f3402255bc..36faff2c6a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,6 +44,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + pendingPushAlarm(false), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -54,6 +55,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } void @@ -91,10 +93,11 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].items[wl_offset])); DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, responseQueue.size(), - cacheBlocks[block_index].items[wl_offset].to_string()); + __func__, addr, block_index, wl_offset, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; assert(!responseQueue.empty()); @@ -156,7 +159,7 @@ CoalesceEngine::recvReadAddr(Addr addr) return false; } cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; @@ -236,9 +239,9 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToAlarm() +CoalesceEngine::respondToMemAlarm() { - assert(pendingAlarm() && (!nextEvictEvent.scheduled())); + assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); schedule(nextEvictEvent, nextCycle()); } @@ -290,7 +293,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) , __func__, block_index, wl_offset, responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].takenMask |= (1 << wl_offset); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block servicedIndices.push_back(i); @@ -336,27 +339,27 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); - assert((cacheBlocks[block_index].takenMask & (1 << wl_offset)) == + assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].hasChange = true; + cacheBlocks[block_index].dirty = true; stats.numVertexWrites++; } cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].takenMask &= ~(1 << wl_offset); + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].takenMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack bool found = false; - for (auto i : evictQueue) { + for (auto i : applyQueue) { if (i == block_index) { found = true; break; @@ -364,12 +367,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (!found) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); } if ((!applyQueue.empty()) && + (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -381,16 +385,27 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].takenMask) { + if (cacheBlocks[block_index].busyMask) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].hasChange) { + } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - evictQueue.push_back(block_index); + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -407,20 +422,32 @@ CoalesceEngine::processNextApplyEvent() __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); } else { - // peerPushEngine->setPushAlarm(); - // pendingPushAlarm = true; + peerPushEngine->setPushAlarm(); + pendingPushAlarm = true; return; } } } // TODO: This is where eviction policy goes - evictQueue.push_back(block_index); + // TODO: Fix this hack. + bool found = false; + for (auto i : evictQueue) { + if (i == block_index) { + found = true; + break; + } + } + if (!found) { + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); + } } applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingAlarm()) && + (!pendingMemAlarm()) && (!nextEvictEvent.scheduled())) { schedule(nextEvictEvent, nextCycle()); } @@ -436,25 +463,33 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - if (cacheBlocks[block_index].takenMask) { + bool found_in_apply_queue = false; + for (auto i : applyQueue) { + if (i == block_index) { + found_in_apply_queue = true; + break; + } + } + if ((cacheBlocks[block_index].busyMask) || + (found_in_apply_queue)) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].hasChange ? + int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!memReqQueueHasSpace(space_needed)) { + if (!allocateMemReqSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. hasChange: %d, " + "procees the eviction of cache line [%d]. 
dirty: %d, " "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].hasChange, + cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestAlarm(space_needed); + requestMemAlarm(space_needed); return; } else { - if (cacheBlocks[block_index].hasChange) { + if (cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( @@ -484,21 +519,21 @@ CoalesceEngine::processNextEvictEvent() enqueueMemReq(read_pkt); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].takenMask = 0; + cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = false; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].hasChange = false; + cacheBlocks[block_index].dirty = false; DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", __func__, block_index); } @@ -513,6 +548,14 @@ CoalesceEngine::processNextEvictEvent() } } +void +CoalesceEngine::respondToPushAlarm() +{ + assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); + pendingPushAlarm = false; + schedule(nextApplyEvent, nextCycle()); +} + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 82b03f53aa..824faef10d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,21 +49,21 @@ class CoalesceEngine : public BaseMemEngine { WorkListItem* items; Addr addr; - uint8_t takenMask; + uint8_t busyMask; bool allocated; bool valid; bool hasConflict; - bool hasChange; + bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), - takenMask(0), + busyMask(0), allocated(false), valid(false), hasConflict(false), - hasChange(false) + dirty(false) { items = new WorkListItem [num_elements]; } @@ -83,6 +83,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + bool pendingPushAlarm; std::deque applyQueue; std::deque evictQueue; @@ -117,7 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -131,6 +132,8 @@ class CoalesceEngine : public BaseMemEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); + + 
void respondToPushAlarm(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 69b9f3f23e..d5563cca7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -36,6 +37,7 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), + pushAlarmSet(false), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), @@ -56,6 +58,12 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } } +void +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +{ + peerCoalesceEngine = coalesce_engine; +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { @@ -146,11 +154,15 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { + pushAlarmSet = false; + peerCoalesceEngine->respondToPushAlarm(); + } } if (memReqQueueFull()) { if (!pushReqQueue.empty()) { - requestAlarm(1); + requestMemAlarm(1); } return; } @@ -161,7 +173,7 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToAlarm() +PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); @@ -200,9 +212,6 @@ PushEngine::processNextPushEvent() // TODO: Implement propagate function here uint32_t update_value = value + 1; - DPRINTF(MPU, "%s: Sending an update to %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - PacketPtr update = createUpdatePacket( curr_edge->neighbor, update_value); @@ -249,6 +258,13 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } +void +PushEngine::setPushAlarm() +{ + assert(!pushAlarmSet); + pushAlarmSet = true; +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7a6981daa0..ce24f862ba 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -36,6 +36,8 @@ namespace gem5 { +class CoalesceEngine; + class PushEngine : public BaseMemEngine { private: @@ -95,6 +97,9 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; + bool pushAlarmSet; + CoalesceEngine* peerCoalesceEngine; + ReqPort reqPort; Addr baseEdgeAddr; @@ -134,7 +139,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual void respondToAlarm(); + virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,6 +150,10 @@ class PushEngine : public BaseMemEngine PortID idx=InvalidPortID) override; bool recvWLItem(WorkListItem wl); + + void 
registerCoalesceEngine(CoalesceEngine* coalesce_engine); + + void setPushAlarm(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e949cbcf5b..75ac4f784e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -37,7 +37,6 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams ¶ms): BaseReduceEngine(params), respPort(name() + ".resp_port", this), - blockedByCoalescer(false), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 597fdb2b1e..27fc3efa7a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -69,7 +69,6 @@ class WLEngine : public BaseReduceEngine RespPort respPort; - bool blockedByCoalescer; CoalesceEngine* coalesceEngine; int updateQueueSize; From 929aab118886fde9e286876fd2dc997be0a8684c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 14:15:30 -0700 Subject: [PATCH 104/287] Correctness passed with finite push queue and facebook graph. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 20 +++++++++++++------- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 3 ++- 4 files changed, 22 insertions(+), 16 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 31b65ae726..8a6ac783c3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -158,6 +158,6 @@ def get_inputs(): m5.instantiate() - exit_event = m5.simulate(1000000000000) + exit_event = m5.simulate() print("Simulation finished!") exit() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 36faff2c6a..39144972df 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -349,7 +349,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d] = %s.\n", __func__, block_index, + DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. @@ -409,15 +410,20 @@ CoalesceEngine::processNextApplyEvent() } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - cacheBlocks[block_index].items[i].prop = std::min( + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - // TODO: Is this correct? 
- cacheBlocks[block_index].items[i].tempProp = cacheBlocks[block_index].items[i].prop; - if (cacheBlocks[block_index].items[i].prop != old_prop) { - if (peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i])) { + if (new_prop != old_prop) { + if (peerPushEngine->allocatePushSpace()) { + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", + __func__, + cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), + cacheBlocks[block_index].items[i].to_string()); + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", __func__, cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5563cca7c..8cfe3c72cc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,7 +97,7 @@ PushEngine::ReqPort::recvReqRetry() } } -bool +void PushEngine::recvWLItem(WorkListItem wl) { // If there are no outdoing edges, no need to generate and push @@ -105,14 +105,14 @@ PushEngine::recvWLItem(WorkListItem wl) if (wl.degree == 0) { DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", __func__, wl.to_string()); - return true; + return; } assert((pushReqQueueSize == 0) || - (pushReqQueue.size() <= pushReqQueueSize)); - if ((pushReqQueueSize != 0) && (pushReqQueue.size() == pushReqQueueSize)) { - return false; - } + (pushReqQueue.size() < pushReqQueueSize)); + panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " + "method after checking if there is enough push space. 
Use " + "allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); @@ -125,7 +125,6 @@ PushEngine::recvWLItem(WorkListItem wl) (!memReqQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } - return true; } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ce24f862ba..ae465f6eb1 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -149,7 +149,8 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool recvWLItem(WorkListItem wl); + bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + void recvWLItem(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine); From e16c0deadb328f6496d9f424a21cd3677a5ce542 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 22 May 2022 17:49:06 -0700 Subject: [PATCH 105/287] Fixing an incorrect assertion. 
--- configs/accl/sega.py | 23 +++++++++++++++++------ src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 1 - 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8a6ac783c3..11e2cfb6af 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=64, attached_memory_atom_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, @@ -109,7 +109,12 @@ def setEdgePort(self, port, i): self.edge_mem_ctrl[i].port = port class SEGA(System): - def __init__(self, num_mpus, vertex_cache_line_size, graph_path): + def __init__(self, + num_mpus, + vertex_cache_line_size, + graph_path, + first_addr, + first_value): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -122,7 +127,7 @@ def __init__(self, num_mpus, vertex_cache_line_size, graph_path): response_latency=1, width=64) - self.ctrl = CenteralController(addr=0, value=0, + self.ctrl = CenteralController(addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports @@ -147,13 +152,19 @@ def get_inputs(): argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("graph_path", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, args.graph_path + return args.num_mpus, args.vertex_cache_line_size, \ + args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, graph_path = get_inputs() + num_mpus, vertex_cache_line_size, \ + graph_path, 
first_addr, first_value = get_inputs() + print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, graph_path) + system = SEGA(num_mpus, vertex_cache_line_size, \ + graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 39144972df..dd651f9e5a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -241,7 +241,7 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::respondToMemAlarm() { - assert(pendingMemAlarm() && (!nextEvictEvent.scheduled())); + assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8cfe3c72cc..ed23fb4d4b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -176,7 +176,6 @@ PushEngine::respondToMemAlarm() { assert(!nextAddrGenEvent.scheduled()); schedule(nextAddrGenEvent, nextCycle()); - DPRINTF(MPU, "%s: Responded to an alarm.\n", __func__); } bool From 83af4b3b2720bdb7d0ab3b836c4f0c2516b1a950 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 3 Jun 2022 07:44:25 -0700 Subject: [PATCH 106/287] Converting apply and evict queues to FIFOSet. 
--- src/accl/graph/base/data_structs.hh | 50 +++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 68 +++++++++----------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.hh | 3 +- 4 files changed, 76 insertions(+), 49 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 7535d4bbac..e03686a7e9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,6 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" +#include +#include + namespace gem5 { @@ -83,6 +86,53 @@ struct __attribute__ ((packed)) Edge }; static_assert(isPowerOf2(sizeof(WorkListItem))); +static_assert(isPowerOf2(sizeof(Edge))); + +template +class FIFOSet +{ + private: + std::queue fifo; + std::unordered_set set; + + public: + FIFOSet(int cap) + { + set.reserve(cap); + } + + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); + } + } + + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } + + T& front() + { + return fifo.front(); + } + + size_t size() { + return fifo.size(); + } + + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } +}; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dd651f9e5a..f96adbf8d8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,6 +45,8 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), pendingPushAlarm(false), + applyQueue(numLines), + evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -55,6 +57,7 @@ CoalesceEngine::CoalesceEngine(const 
CoalesceEngineParams ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + peerPushEngine->registerCoalesceEngine(this); } @@ -141,14 +144,18 @@ CoalesceEngine::recvReadAddr(Addr addr) "line[%d]", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; + if (!cacheBlocks[block_index].busyMask) { + applyQueue.push_back(block_index); + assert(!applyQueue.empty()); + if ((!nextApplyEvent.scheduled()) && + (!pendingPushAlarm)) { + schedule(nextApplyEvent, nextCycle()); + } + } return true; } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - //TODO: Fix this to work with new inheritance. - // assert( - // outstandingMemReqQueue.size() <= - // outstandingMemReqQueueSize); DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); @@ -278,8 +285,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHRMap[block_index].size(); i++) { Addr miss_addr = MSHRMap[block_index][i]; - Addr aligned_miss_addr = std::floor(miss_addr / peerMemoryAtomSize) * peerMemoryAtomSize; - + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " @@ -333,7 +339,7 @@ void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = std::floor(addr / peerMemoryAtomSize) * peerMemoryAtomSize; + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -359,18 +365,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); // TODO: Fix this hack - bool found = false; - for (auto i : applyQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { - applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } + applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && @@ -395,15 +392,9 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " "needed. Adding the cache line to evict schedule.\n", __func__, block_index); - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict) { evictQueue.push_back(block_index); + assert(!evictQueue.empty()); DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -435,15 +426,7 @@ CoalesceEngine::processNextApplyEvent() } } // TODO: This is where eviction policy goes - // TODO: Fix this hack. - bool found = false; - for (auto i : evictQueue) { - if (i == block_index) { - found = true; - break; - } - } - if (!found) { + if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); @@ -469,15 +452,8 @@ CoalesceEngine::processNextEvictEvent() { int block_index = evictQueue.front(); - bool found_in_apply_queue = false; - for (auto i : applyQueue) { - if (i == block_index) { - found_in_apply_queue = true; - break; - } - } if ((cacheBlocks[block_index].busyMask) || - (found_in_apply_queue)) { + (applyQueue.find(block_index))) { DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); @@ -514,8 +490,8 @@ CoalesceEngine::processNextEvictEvent() " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - std::floor(miss_addr / peerMemoryAtomSize) * - peerMemoryAtomSize; + roundDown(miss_addr, peerMemoryAtomSize); + PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 824faef10d..177bb067ab 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -84,9 +84,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; bool pendingPushAlarm; - std::deque applyQueue; + FIFOSet applyQueue; - std::deque evictQueue; + FIFOSet evictQueue; EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ae465f6eb1..c93b3b386d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -31,6 +31,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" #include "params/PushEngine.hh" namespace gem5 @@ -59,7 +60,7 @@ class PushEngine : public BaseMemEngine std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); - Addr 
aligned_addr = std::floor(_start / _atom) * _atom; + Addr aligned_addr = roundDown(_start, _atom); Addr offset = _start - aligned_addr; int num_items = 0; From e9c4b2e982425c29d348780c5d819a8b7893f377 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 13 Jun 2022 14:48:49 -0700 Subject: [PATCH 107/287] Moving delete pkt in push_engine.cc. --- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ed23fb4d4b..cb71b73c60 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -228,8 +228,8 @@ PushEngine::processNextPushEvent() reqOffsetMap.erase(pkt->req); reqNumEdgeMap.erase(pkt->req); reqValueMap.erase(pkt->req); - delete pkt; memRespQueue.pop_front(); + delete pkt; } if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { From a07fba27ea6d0869853fe4db500680e4c62aeb9f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 19 Jun 2022 14:29:57 -0700 Subject: [PATCH 108/287] Enforced limited length on memRespQueue in PushEngine. 
--- configs/accl/sega.py | 15 +++++--- src/accl/graph/SConscript | 3 +- src/accl/graph/base/BaseMemEngine.py | 2 ++ src/accl/graph/base/base_mem_engine.cc | 49 +++++++++++++++++--------- src/accl/graph/base/base_mem_engine.hh | 4 +++ src/accl/graph/sega/coalesce_engine.cc | 5 ++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 5 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 12 +++++-- 10 files changed, 72 insertions(+), 25 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 11e2cfb6af..a5dd759f1f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,16 +9,21 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=64, - attached_memory_atom_size=64) + push_req_queue_size=1, + attached_memory_atom_size=64, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=16) + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=32, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 00fa2466dd..9663d3f263 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,4 +27,5 @@ Import('*') -DebugFlag('MPU') \ No newline at end of file +DebugFlag('MPU') +DebugFlag('SEGAQSize') diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py index 69f68e9dfc..2ecb6659d8 100644 --- a/src/accl/graph/base/BaseMemEngine.py +++ b/src/accl/graph/base/BaseMemEngine.py @@ -43,3 +43,5 @@ class 
BaseMemEngine(ClockedObject): attached_memory_atom_size = Param.Int(64, "The atom size of the attached " "memory.") + + resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index f02f1d2feb..112b0d63cb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,8 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" +#include "debug/SEGAQSize.hh" + namespace gem5 { @@ -37,6 +39,8 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): system(params.system), memPort(name() + ".mem_port", this), outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + onTheFlyReqs(0), + respQueueSize(params.resp_queue_size), memAlarmRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), @@ -73,7 +77,7 @@ bool BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) { //TODO: Investigate sending true all the time - return owner->handleMemResp(pkt); + return owner->recvTimingResp(pkt); } void @@ -98,20 +102,25 @@ BaseMemEngine::processNextMemReqEvent() return; } - // TODO: Maybe add a DPRINTF here. - PacketPtr pkt = outstandingMemReqQueue.front(); - memPort.sendPacket(pkt); - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - - if (memAlarmRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; - memSpaceRequested = 0; - respondToMemAlarm(); + if ((respBuffSize() == -1) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + PacketPtr pkt = outstandingMemReqQueue.front(); + memPort.sendPacket(pkt); + onTheFlyReqs++; + DPRINTF(MPU, "%s: Sent a packet to memory with the following info. 
" + "pkt->addr: %lu, pkt->size: %lu.\n", + __func__, pkt->getAddr(), pkt->getSize()); + outstandingMemReqQueue.pop_front(); + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); + + if (memAlarmRequested && + (outstandingMemReqQueue.size() <= + (outstandingMemReqQueueSize - memSpaceRequested))) { + memAlarmRequested = false; + memSpaceRequested = 0; + respondToMemAlarm(); + } } if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { @@ -171,7 +180,8 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - + DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", + __func__, outstandingMemReqQueue.size()); assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); @@ -197,4 +207,11 @@ BaseMemEngine::wakeUp() } } +bool +BaseMemEngine::recvTimingResp(PacketPtr pkt) +{ + onTheFlyReqs--; + return handleMemResp(pkt); +} + } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 8a18807e2e..fc67f3f6d8 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -69,6 +69,8 @@ class BaseMemEngine : public ClockedObject MemPort memPort; int outstandingMemReqQueueSize; + int onTheFlyReqs; + int respQueueSize; bool memAlarmRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -90,6 +92,7 @@ class BaseMemEngine : public ClockedObject void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); + virtual int respBuffSize() = 0; virtual void respondToMemAlarm() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; @@ -109,6 +112,7 @@ class BaseMemEngine : public ClockedObject AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); } + bool recvTimingResp(PacketPtr pkt); void 
recvFunctional(PacketPtr pkt); void wakeUp(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f96adbf8d8..ee1e3f85ff 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -141,11 +141,14 @@ CoalesceEngine::recvReadAddr(Addr addr) cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " - "line[%d]", __func__, addr, block_index); + "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to applyQueue. " + "applyQueue.size = %u.\n", __func__, + block_index, applyQueue.size()); assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled()) && (!pendingPushAlarm)) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 177bb067ab..1e353c11b8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; protected: + virtual int respBuffSize() { return -1; } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cb71b73c60..a045bbdead 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -93,6 +93,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + DPRINTF(MPU, "%s: Sent the blockedPacket. " + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); + blockedPacket = nullptr; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c93b3b386d..2c17501d5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -140,6 +140,7 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: + virtual int respBuffSize() { return memRespQueue.size(); } virtual void respondToMemAlarm(); virtual bool handleMemResp(PacketPtr pkt); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 75ac4f784e..55a9147ac9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,6 +136,9 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", + __func__, onTheFlyUpdateMap.size()); if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " @@ -147,6 +150,10 @@ WLEngine::processNextReadEvent() __func__, updateQueue.size()); respPort.checkRetryReq(); } + } else { + DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " + "onTheFlyUpdateMap.size: %lu.\n", __func__, + onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min @@ -209,8 +216,9 @@ WLEngine::processNextReduceEvent() coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" + "onTheFlyUpdateMap.size: %lu.\n", + __func__, addr, onTheFlyUpdateMap.size()); } addrWorkListMap.clear(); } From dd056de8c00f33db13d14350910c5de8d6908c19 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 8 Jul 2022 10:36:46 -0700 Subject: [PATCH 109/287] Adding bit vector implementation for caching push meta data. --- configs/accl/sega.py | 7 +- src/accl/graph/base/base_mem_engine.cc | 10 +- src/accl/graph/base/data_structs.hh | 86 +++++++++------- src/accl/graph/sega/CoalesceEngine.py | 3 + src/accl/graph/sega/coalesce_engine.cc | 137 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 14 ++- src/accl/graph/sega/push_engine.cc | 62 ++++++++--- src/accl/graph/sega/push_engine.hh | 12 ++- 8 files changed, 227 insertions(+), 104 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a5dd759f1f..96408aa185 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=1, + push_req_queue_size=0, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,8 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=2) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) @@ -77,7 +76,7 @@ def __init__(self, ) edge_mem_ctrl.append( SimpleMemory(range=self._edge_ranges[i], - bandwidth="19.2GB/s", + bandwidth="4.8GB/s", latency="30ns", image_file=f"{graph_path}/edgelist_{i}") ) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 112b0d63cb..3086b81fc2 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,7 +29,6 @@ #include 
"accl/graph/base/base_mem_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" namespace gem5 { @@ -102,8 +101,8 @@ BaseMemEngine::processNextMemReqEvent() return; } - if ((respBuffSize() == -1) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { + if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || + (respQueueSize == 0)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -111,8 +110,6 @@ BaseMemEngine::processNextMemReqEvent() "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); if (memAlarmRequested && (outstandingMemReqQueue.size() <= @@ -180,8 +177,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) { panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - DPRINTF(SEGAQSize, "%s: outstandingMemReqQueue.size: %lu.\n", - __func__, outstandingMemReqQueue.size()); + assert(!outstandingMemReqQueue.empty()); if (!nextMemReqEvent.scheduled()) { schedule(nextMemReqEvent, nextCycle()); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e03686a7e9..e30d6029cb 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,8 +32,9 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include +#include namespace gem5 { @@ -91,49 +92,64 @@ static_assert(isPowerOf2(sizeof(Edge))); template class FIFOSet { - private: - std::queue fifo; - std::unordered_set set; - - public: - FIFOSet(int cap) - { - set.reserve(cap); - } + private: + std::queue fifo; + std::unordered_set set; - void push_back(T item) - { - if (set.find(item) == set.end()) { - set.insert(item); - fifo.push(item); - } - } + public: + FIFOSet(int cap) + { + set.reserve(cap); + } - void pop_front() - { - T front = fifo.front(); - 
set.erase(front); - fifo.pop(); + void push_back(T item) + { + if (set.find(item) == set.end()) { + set.insert(item); + fifo.push(item); } + } - T& front() - { - return fifo.front(); - } + void pop_front() + { + T front = fifo.front(); + set.erase(front); + fifo.pop(); + } - size_t size() { - return fifo.size(); - } + T& front() + { + return fifo.front(); + } - bool empty() { - return fifo.empty(); - } + size_t size() { + return fifo.size(); + } - bool find(T item) { - return (set.find(item) != set.end()); - } + bool empty() { + return fifo.empty(); + } + + bool find(T item) { + return (set.find(item) != set.end()); + } }; +// template +// class BitVector +// { +// private: +// int it; +// std::bitset bitStore; + +// public: +// BitVector(): it(0) { bitStore.reset(); } + +// uint32_t next() { + +// } +// }; + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 086f284950..7667a22c5a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -41,4 +41,7 @@ class CoalesceEngine(BaseMemEngine): num_mshr_entry = Param.Int(4, "") num_tgts_per_mshr = Param.Int(20, "") + # Don't change. If changed. It will break functionality of coalesce. 
+ resp_queue_size = 0 + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ee1e3f85ff..b5eeae694e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -44,7 +44,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - pendingPushAlarm(false), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), @@ -58,7 +57,9 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } - peerPushEngine->registerCoalesceEngine(this); + peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); + + needsApply.reset(); } void @@ -67,6 +68,38 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) sendMemFunctional(pkt); } +void +CoalesceEngine::startup() +{ + AddrRangeList vertex_ranges = getAddrRanges(); + + bool found = false; + Addr first_match_addr = 0; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(first_match_addr)) { + found = true; + break; + } + } + first_match_addr += peerMemoryAtomSize; + } + + found = false; + Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + while(!found) { + for (auto range: vertex_ranges) { + if (range.contains(second_match_addr)) { + found = true; + break; + } + } + second_match_addr += peerMemoryAtomSize; + } + + nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); +} + void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -150,8 +183,7 @@ CoalesceEngine::recvReadAddr(Addr addr) "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled()) && - (!pendingPushAlarm)) { + if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); 
} } @@ -363,18 +395,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { + if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); - // TODO: Fix this hack applyQueue.push_back(block_index); DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } if ((!applyQueue.empty()) && - (!pendingPushAlarm) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } @@ -393,14 +423,7 @@ CoalesceEngine::processNextApplyEvent() stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " - "needed. Adding the cache line to evict schedule.\n", - __func__, block_index); - if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - assert(!evictQueue.empty()); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; @@ -409,31 +432,38 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[i].tempProp); if (new_prop != old_prop) { - if (peerPushEngine->allocatePushSpace()) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", - __func__, + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - DPRINTF(MPU, "%s: Sent WorkListItem [%d] to PushEngine.\n", - __func__, - cacheBlocks[block_index].addr + i * sizeof(WorkListItem)); + + Addr block_addr = cacheBlocks[block_index].addr; + int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits + i; + + if (needsApply[bit_index] == 1) { + DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." + " Not doing anything further.\n", __func__, + block_addr + (i * sizeof(WorkListItem))); } else { - peerPushEngine->setPushAlarm(); - pendingPushAlarm = true; - return; + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsApply[bit_index] = 1; + } } } } - // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. 
evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); - } + } + + // TODO: This is where eviction policy goes + if (cacheBlocks[block_index].hasConflict){ + evictQueue.push_back(block_index); + DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + __func__, block_index, evictQueue.size()); } applyQueue.pop_front(); @@ -536,9 +566,42 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::respondToPushAlarm() { - assert(pendingPushAlarm && (!nextApplyEvent.scheduled())); - pendingPushAlarm = false; - schedule(nextApplyEvent, nextCycle()); + DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + int it; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + uint32_t slice = 0; + for (int i = 0; i < numElementsPerLine; i++) { + slice <<= 1; + slice |= needsApply[it + i]; + } + if (slice) { + break; + } + } + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + __func__, slice, it); + + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); + int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + // hit in cache + bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + for (int i = 0; i < numElementsPerLine; i++) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], do_push); + } + + // TODO: Should we add block_index to evict_queue? 
+ if (do_push && cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + } + } else { + PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); + + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 1e353c11b8..e6c70502af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,12 +29,16 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#include + #include "accl/graph/base/base_mem_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" +#define MAX_BITVECTOR_SIZE (1 << 30) + // TODO: Add parameters for size, memory atom size, type size, // length of items in the blocks. namespace gem5 @@ -68,6 +72,7 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + int nmpu; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -83,8 +88,9 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; - bool pendingPushAlarm; FIFOSet applyQueue; + int needsApplyFirstPointer; + std::bitset needsApply; FIFOSet evictQueue; @@ -127,14 +133,16 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - void recvFunctional(PacketPtr pkt); - bool recvReadAddr(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); void respondToPushAlarm(); + + void recvFunctional(PacketPtr pkt); + + virtual void startup(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a045bbdead..8bc2d55a28 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" 
+#include "debug/SEGAQSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -37,9 +38,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - pushAlarmSet(false), + retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), + numRetries(0), pushReqQueueSize(params.push_req_queue_size), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -59,9 +61,11 @@ PushEngine::getPort(const std::string &if_name, PortID idx) } void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine) +PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line) { peerCoalesceEngine = coalesce_engine; + numElementsPerLine = elements_per_line; } void @@ -115,15 +119,21 @@ PushEngine::recvWLItem(WorkListItem wl) assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); - panic_if(pushReqQueue.size() == pushReqQueueSize, "You should call this " - "method after checking if there is enough push space. Use " - "allocatePushSpace.\n"); + panic_if((pushReqQueue.size() == pushReqQueueSize) && + (pushReqQueueSize != 0), "You should call this method after " + "checking if there is enough push space. 
Use allocatePushSpace.\n"); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + + if (curTick() % 50000 == 0) { + DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", + __func__, pushReqQueue.size()); + } assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && @@ -132,6 +142,25 @@ PushEngine::recvWLItem(WorkListItem wl) } } +void +PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +{ + if (do_push) { + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + } + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled()) && + (!memReqQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } +} + void PushEngine::processNextAddrGenEvent() { @@ -158,8 +187,10 @@ PushEngine::processNextAddrGenEvent() DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - if (pushAlarmSet && (pushReqQueue.size() == pushReqQueueSize - 1)) { - pushAlarmSet = false; + if (numRetries > 0) { + retrySpaceAllocated++; + } + if ((retrySpaceAllocated % numElementsPerLine) == 0) { peerCoalesceEngine->respondToPushAlarm(); } } @@ -261,17 +292,20 @@ PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -void -PushEngine::setPushAlarm() -{ - assert(!pushAlarmSet); - pushAlarmSet = true; +bool +PushEngine::allocatePushSpace() { + if ((pushReqQueueSize == 0) || + ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + return true; + } else { + numRetries++; + return false; + } } PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2c17501d5b..4f388cd7e6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -98,13 +98,15 @@ class PushEngine : public BaseMemEngine virtual void recvReqRetry(); }; - bool pushAlarmSet; + int numElementsPerLine; + int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; + int numRetries; int pushReqQueueSize; std::deque pushReqQueue; @@ -151,12 +153,14 @@ class PushEngine : public BaseMemEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace() { return pushReqQueue.size() < pushReqQueueSize; } + bool allocatePushSpace(); + void recvWLItem(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine); + void recvWLItemRetry(WorkListItem wl, bool do_push); - void setPushAlarm(); + void registerCoalesceEngine(CoalesceEngine* coalesce_engine, + int elements_per_line); }; } From 7a351854013b45cfe260990b60dbc160e1aac24a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: 
Sun, 17 Jul 2022 16:12:07 -0700 Subject: [PATCH 110/287] Completing retry between coalesce and push engine. --- configs/accl/sega.py | 4 +- src/accl/graph/SConscript | 1 + src/accl/graph/TODO.md | 7 +- src/accl/graph/base/base_mem_engine.cc | 13 ++- src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 155 +++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 12 ++ src/accl/graph/sega/push_engine.cc | 11 +- 8 files changed, 157 insertions(+), 49 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 96408aa185..65645b3bb3 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,7 +9,7 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=0, + push_req_queue_size=4, attached_memory_atom_size=64, outstanding_mem_req_queue_size=1, resp_queue_size=1) @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=2) + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 9663d3f263..36e16affa3 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -29,3 +29,4 @@ Import('*') DebugFlag('MPU') DebugFlag('SEGAQSize') +DebugFlag('MahyarMath') diff --git a/src/accl/graph/TODO.md b/src/accl/graph/TODO.md index 29b5a2939e..ebfca7e794 100644 --- a/src/accl/graph/TODO.md +++ b/src/accl/graph/TODO.md @@ -1,5 +1,8 @@ # TODO Items -* Replace std::floor with roundDown from intmath.hh in src * We might need to revisit the fact that we could insert something to a queue on - the same cycle that another event is consuming something from the queue. 
\ No newline at end of file + the same cycle that another event is consuming something from the queue. +* Move checking for wl.degree == 0 to coalesce engine. +* Fix the retry system between memory queue and coalesce engine +* Update inheritance: There is not enough reason for PushEngine and +CoalesceEngine to be of the same type (i.e. delete BaseMemEngine). diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 3086b81fc2..64aaa3a737 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -159,17 +159,22 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemReqSpace(int space) { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); return ( - outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space) + (outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) ); } bool BaseMemEngine::memReqQueueFull() { - assert(outstandingMemReqQueue.size() <= outstandingMemReqQueueSize); - return (outstandingMemReqQueue.size() == outstandingMemReqQueueSize); + assert((outstandingMemReqQueueSize == 0) || + (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + return ( + (outstandingMemReqQueueSize != 0) && + (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); } void diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index e30d6029cb..9c250c6a2f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,6 +93,7 @@ template class FIFOSet { private: + // int numInvalids; std::queue fifo; std::unordered_set set; @@ -127,7 +128,7 @@ class FIFOSet } bool empty() { - return fifo.empty(); + return (size() == 0); } bool find(T item) { diff --git 
a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b5eeae694e..1c3f2bcadf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -75,29 +76,39 @@ CoalesceEngine::startup() bool found = false; Addr first_match_addr = 0; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(first_match_addr)) { found = true; break; } } + if (found) { + break; + } first_match_addr += peerMemoryAtomSize; } found = false; Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(!found) { + while(true) { for (auto range: vertex_ranges) { if (range.contains(second_match_addr)) { found = true; break; } } + if (found) { + break; + } second_match_addr += peerMemoryAtomSize; } nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + memoryAddressOffset = first_match_addr; + DPRINTF(MahyarMath, "%s: Initialized address translation information." 
+ " nmpu: %d, memoryAddressOffset: %lu.\n", + __func__, nmpu, memoryAddressOffset); } void @@ -106,6 +117,40 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + return ((int) (addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", + __func__, addr); + int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + int bit_index = atom_index * block_bits; + DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", + __func__, addr, bit_index); + return bit_index; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", + __func__, index); + Addr block_addr = (nmpu * peerMemoryAtomSize) * + ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); + DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", + __func__, index, (block_addr + memoryAddressOffset)); + return (block_addr + memoryAddressOffset); +} + bool CoalesceEngine::recvReadAddr(Addr addr) { @@ -298,6 +343,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + if (pkt->findNextSenderState()) { + Addr addr = pkt->getAddr(); + int it = getBitIndexBase(addr); + int block_index = getBlockIndex(addr); + bool found_in_cache = (cacheBlocks[block_index].addr == addr); + + // We have to send the items regardless of them being found in the + // cache. However, if they are found in the cache, two things should + // happen. First, do_push should be set to false and the bit vector + // value for the items should not change. 
To future Mahyar and Marjan, + // If this is confusing, please look at where each item is pushed to + // the apply queue. Hint: Think about updates that might not be sent + // out if you reset the bit regardless of the line being found in the + // cache. + WorkListItem* items = pkt->getPtr(); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[it + i] = + (needsApply[it + i] == 1) && found_in_cache ? 1 : 0; + + peerPushEngine->recvWLItemRetry(items[i], + ((!found_in_cache) && needsApply[it + i])); + } + return true; + } + Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; @@ -395,11 +465,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) {(aligned_addr / peerMemoryAtomSize) % numLines; + if ((cacheBlocks[block_index].busyMask == 0)) { DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); + int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); + for (int i = 0; i < numElementsPerLine; i++) { + needsApply[bit_index + i] = 0; + } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -438,22 +512,15 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - Addr block_addr = cacheBlocks[block_index].addr; - int atom_index = (int) (block_addr / (peerMemoryAtomSize * nmpu)); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits + i; + int bit_index = + getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (needsApply[bit_index] == 1) { - DPRINTF(MPU, "%s: WorkListItem[%lu] already set in bit-vector." 
- " Not doing anything further.\n", __func__, - block_addr + (i * sizeof(WorkListItem))); + assert(needsApply[bit_index] == 0); + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); } else { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; - } + needsApply[bit_index] = 1; } } } @@ -567,40 +634,56 @@ void CoalesceEngine::respondToPushAlarm() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); - int it; + Addr block_addr = 0; + int block_index = 0; + int it = 0; + uint32_t slice = 0; + bool hit_in_cache = false; for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - uint32_t slice = 0; for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsApply[it + i]; } if (slice) { - break; + block_addr = getBlockAddrFromBitIndex(it); + block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + if ((cacheBlocks[block_index].addr == block_addr) && + (cacheBlocks[block_index].valid)) { + if (cacheBlocks[block_index].busyMask == 0) { + hit_in_cache = true; + break; + } + } else { + hit_in_cache = false; + break; + } } } + + assert(it < MAX_BITVECTOR_SIZE); + DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", __func__, slice, it); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - ((int)(it / (peerMemoryAtomSize / sizeof(WorkListItem)))); - int block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; - - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - // hit in cache - bool do_push = cacheBlocks[block_index].busyMask == 0 ? true : false; + if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], do_push); - } - - // TODO: Should we add block_index to evict_queue? 
- if (do_push && cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); + peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], + (needsApply[it + i] == 1)); + needsApply[it + i] = 0; } } else { + // FIXME: Fix the retry mechanism between memory and cache to + // handle memory retries correctly. This probably requires scheduling + // an event for sending the retry. For now we're enabling infinite + // queueing in the outstandingMemReqQueue. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + if (allocateMemReqSpace(1)) { + enqueueMemReq(pkt); + } else { + requestMemAlarm(1); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e6c70502af..973ea479c1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -72,7 +72,15 @@ class CoalesceEngine : public BaseMemEngine items = new WorkListItem [num_elements]; } }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + int nmpu; + Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -94,6 +102,10 @@ class CoalesceEngine : public BaseMemEngine FIFOSet evictQueue; + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 8bc2d55a28..fa611392b4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -149,9 +149,13 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - - 
pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + if (wl.degree != 0) { + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + } else { + DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", + __func__, wl.to_string()); + } numRetries--; } retrySpaceAllocated--; @@ -164,7 +168,6 @@ PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) void PushEngine::processNextAddrGenEvent() { - Addr aligned_addr, offset; int num_edges; From 2b9604dc53c675f1e4fc943c162e43929ff0af27 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 07:33:16 -0700 Subject: [PATCH 111/287] Updating variable names and debug flags. --- src/accl/graph/SConscript | 3 +- src/accl/graph/base/base_mem_engine.cc | 20 ++--- src/accl/graph/base/base_mem_engine.hh | 12 +-- src/accl/graph/base/data_structs.hh | 33 +++----- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/coalesce_engine.cc | 100 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 9 +-- src/accl/graph/sega/push_engine.cc | 53 ++++++------- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 2 +- 10 files changed, 113 insertions(+), 124 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 36e16affa3..7ca60c30bd 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,4 @@ Import('*') DebugFlag('MPU') -DebugFlag('SEGAQSize') -DebugFlag('MahyarMath') +# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 64aaa3a737..32c314033d 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,7 +40,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), 
respQueueSize(params.resp_queue_size), - memAlarmRequested(false), + memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), _requestorId(system->getRequestorId(this)), @@ -111,12 +111,12 @@ BaseMemEngine::processNextMemReqEvent() __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); - if (memAlarmRequested && + if (memRetryRequested && (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - memSpaceRequested))) { - memAlarmRequested = false; + memRetryRequested = false; memSpaceRequested = 0; - respondToMemAlarm(); + recvMemRetry(); } } @@ -157,7 +157,7 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) } bool -BaseMemEngine::allocateMemReqSpace(int space) +BaseMemEngine::allocateMemQueueSpace(int space) { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -168,7 +168,7 @@ BaseMemEngine::allocateMemReqSpace(int space) } bool -BaseMemEngine::memReqQueueFull() +BaseMemEngine::memQueueFull() { assert((outstandingMemReqQueueSize == 0) || (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); @@ -180,7 +180,7 @@ BaseMemEngine::memReqQueueFull() void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { - panic_if(memReqQueueFull(), "Should not enqueue if queue full.\n"); + panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); assert(!outstandingMemReqQueue.empty()); @@ -190,12 +190,12 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) } void -BaseMemEngine::requestMemAlarm(int space) { - panic_if((memAlarmRequested == true) || (memSpaceRequested != 0), +BaseMemEngine::requestMemRetry(int space) { + panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); - memAlarmRequested = true; 
+ memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index fc67f3f6d8..64ef49ee1d 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -71,7 +71,7 @@ class BaseMemEngine : public ClockedObject int outstandingMemReqQueueSize; int onTheFlyReqs; int respQueueSize; - bool memAlarmRequested; + bool memRetryRequested; int memSpaceRequested; std::deque outstandingMemReqQueue; @@ -83,17 +83,17 @@ class BaseMemEngine : public ClockedObject size_t peerMemoryAtomSize; - bool allocateMemReqSpace(int space); - bool memReqQueueFull(); + bool allocateMemQueueSpace(int space); + bool memQueueFull(); - bool pendingMemAlarm() { return memAlarmRequested; } - void requestMemAlarm(int space); + bool pendingMemRetry() { return memRetryRequested; } + void requestMemRetry(int space); void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void enqueueMemReq(PacketPtr pkt); virtual int respBuffSize() = 0; - virtual void respondToMemAlarm() = 0; + virtual void recvMemRetry() = 0; virtual bool handleMemResp(PacketPtr pkt) = 0; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 9c250c6a2f..f938be72f1 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -93,8 +93,6 @@ template class FIFOSet { private: - // int numInvalids; - std::queue fifo; std::unordered_set set; public: @@ -107,24 +105,22 @@ class FIFOSet { if (set.find(item) == set.end()) { set.insert(item); - fifo.push(item); } } void pop_front() { - T front = fifo.front(); - set.erase(front); - fifo.pop(); + assert(set.begin() != set.end()); + set.erase(set.begin()); } - T& front() + T front() { - return fifo.front(); + return *(set.begin()); } size_t size() { - return fifo.size(); + return set.size(); } bool empty() { @@ -134,22 +130,11 @@ 
class FIFOSet bool find(T item) { return (set.find(item) != set.end()); } -}; - -// template -// class BitVector -// { -// private: -// int it; -// std::bitset bitStore; - -// public: -// BitVector(): it(0) { bitStore.reset(); } -// uint32_t next() { - -// } -// }; + void erase(T item) { + set.erase(item); + } +}; } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 16fab86ede..77e508f4ed 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,3 +39,6 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') +DebugFlag('CoalesceEngine') +DebugFlag('PushEngine') +DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1c3f2bcadf..66b8e1fad7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/MahyarMath.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -60,7 +59,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsApply.reset(); + needsPush.reset(); } void @@ -106,9 +105,6 @@ CoalesceEngine::startup() nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); memoryAddressOffset = first_match_addr; - DPRINTF(MahyarMath, "%s: Initialized address translation information." 
- " nmpu: %d, memoryAddressOffset: %lu.\n", - __func__, nmpu, memoryAddressOffset); } void @@ -128,13 +124,9 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { - DPRINTF(MahyarMath, "%s: Calculating BitIndexBase for addr %lu.\n", - __func__, addr); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; - DPRINTF(MahyarMath, "%s: BitIndexBase for addr %lu is %d.\n", - __func__, addr, bit_index); return bit_index; } @@ -142,17 +134,13 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { - DPRINTF(MahyarMath, "%s: Calculating BlockAddr for index %d.\n", - __func__, index); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - DPRINTF(MahyarMath, "%s: BlockAddr for index %d is %lu.\n", - __func__, index, (block_addr + memoryAddressOffset)); return (block_addr + memoryAddressOffset); } bool -CoalesceEngine::recvReadAddr(Addr addr) +CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", @@ -239,7 +227,7 @@ CoalesceEngine::recvReadAddr(Addr addr) DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memReqQueueFull()) { + if (memQueueFull()) { DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. 
" "Rejecting request.\n", __func__); stats.readRejections++; @@ -326,7 +314,7 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::respondToMemAlarm() +CoalesceEngine::recvMemRetry() { assert(!nextEvictEvent.scheduled()); schedule(nextEvictEvent, nextCycle()); @@ -347,8 +335,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool found_in_cache = (cacheBlocks[block_index].addr == addr); + bool line_do_push = false; + if (cacheBlocks[block_index].addr == addr) { + if (cacheBlocks[block_index].busyMask == 0) { + assert(applyQueue.find(block_index)); + line_do_push = true; + } else { + line_do_push = false; + } + } // We have to send the items regardless of them being found in the // cache. However, if they are found in the cache, two things should // happen. First, do_push should be set to false and the bit vector @@ -359,11 +355,19 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // cache. WorkListItem* items = pkt->getPtr(); for (int i = 0; i < numElementsPerLine; i++) { - needsApply[it + i] = - (needsApply[it + i] == 1) && found_in_cache ? 
1 : 0; - + assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(items[i], - ((!found_in_cache) && needsApply[it + i])); + (line_do_push && needsPush[it + i])); + } + + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } return true; } @@ -470,10 +474,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr); - for (int i = 0; i < numElementsPerLine; i++) { - needsApply[bit_index + i] = 0; - } DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -488,6 +488,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { + if (applyQueue.empty()) { + return; + } + int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -514,13 +518,13 @@ CoalesceEngine::processNextApplyEvent() int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - - assert(needsApply[bit_index] == 0); - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); - } else { - needsApply[bit_index] = 1; + if (cacheBlocks[block_index].items[i].degree != 0) { + if (peerPushEngine->allocatePushSpace()) { + peerPushEngine->recvWLItem( + cacheBlocks[block_index].items[i]); + } else { + needsPush[bit_index] = 1; + } } } } @@ -536,7 +540,7 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!evictQueue.empty()) && - (!pendingMemAlarm()) && + (!pendingMemRetry()) && (!nextEvictEvent.scheduled())) { 
schedule(nextEvictEvent, nextCycle()); } @@ -562,13 +566,13 @@ CoalesceEngine::processNextEvictEvent() int space_needed = cacheBlocks[block_index].dirty ? (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemReqSpace(space_needed)) { + if (!allocateMemQueueSpace(space_needed)) { DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, cacheBlocks[block_index].hasConflict); - requestMemAlarm(space_needed); + requestMemRetry(space_needed); return; } else { if (cacheBlocks[block_index].dirty) { @@ -631,7 +635,7 @@ CoalesceEngine::processNextEvictEvent() } void -CoalesceEngine::respondToPushAlarm() +CoalesceEngine::recvPushRetry() { DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); Addr block_addr = 0; @@ -639,14 +643,15 @@ CoalesceEngine::respondToPushAlarm() int it = 0; uint32_t slice = 0; bool hit_in_cache = false; + for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; - slice |= needsApply[it + i]; + slice |= needsPush[it + i]; } if (slice) { block_addr = getBlockAddrFromBitIndex(it); - block_index = ((int) (block_addr / peerMemoryAtomSize)) % numLines; + block_index = getBlockIndex(block_addr); if ((cacheBlocks[block_index].addr == block_addr) && (cacheBlocks[block_index].valid)) { if (cacheBlocks[block_index].busyMask == 0) { @@ -662,14 +667,23 @@ CoalesceEngine::respondToPushAlarm() assert(it < MAX_BITVECTOR_SIZE); - DPRINTF(MPU, "%s: Found slice %u at %d position in needsApply.\n", + DPRINTF(MPU, "%s: Found slice %u at %d position in needsPush.\n", __func__, slice, it); if (hit_in_cache) { for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + 
cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsApply[it + i] == 1)); - needsApply[it + i] = 0; + (needsPush[it + i] == 1)); + needsPush[it + i] = 0; + } + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -679,10 +693,10 @@ CoalesceEngine::respondToPushAlarm() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemReqSpace(1)) { + if (allocateMemQueueSpace(1)) { enqueueMemReq(pkt); } else { - requestMemAlarm(1); + requestMemRetry(1); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 973ea479c1..0fa555c84a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -97,8 +97,7 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; FIFOSet applyQueue; - int needsApplyFirstPointer; - std::bitset needsApply; + std::bitset needsPush; FIFOSet evictQueue; @@ -137,7 +136,7 @@ class CoalesceEngine : public BaseMemEngine protected: virtual int respBuffSize() { return -1; } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: @@ -145,12 +144,12 @@ class CoalesceEngine : public BaseMemEngine CoalesceEngine(const CoalesceEngineParams ¶ms); - bool recvReadAddr(Addr addr); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void respondToPushAlarm(); + void recvPushRetry(); void recvFunctional(PacketPtr pkt); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index fa611392b4..16e0ca6c6c 100644 --- 
a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,7 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/MPU.hh" -#include "debug/SEGAQSize.hh" +#include "debug/PushEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -109,13 +109,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvWLItem(WorkListItem wl) { - // If there are no outdoing edges, no need to generate and push - // updates. Therefore, we only need to return true. - if (wl.degree == 0) { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - return; - } + assert(wl.degree != 0); assert((pushReqQueueSize == 0) || (pushReqQueue.size() < pushReqQueueSize)); @@ -123,6 +117,7 @@ PushEngine::recvWLItem(WorkListItem wl) (pushReqQueueSize != 0), "You should call this method after " "checking if there is enough push space. Use allocatePushSpace.\n"); + DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; @@ -130,14 +125,9 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - if (curTick() % 50000 == 0) { - DPRINTF(SEGAQSize, "%s: pushReqQueue.size: %lu.\n", - __func__, pushReqQueue.size()); - } - assert(!pushReqQueue.empty()); if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { + (!memQueueFull())) { schedule(nextAddrGenEvent, nextCycle()); } } @@ -145,24 +135,22 @@ PushEngine::recvWLItem(WorkListItem wl) void PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) { + DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", + __func__, wl.to_string(), do_push ? 
"true" : "false"); if (do_push) { Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; - if (wl.degree != 0) { - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - } else { - DPRINTF(MPU, "%s: Received a leaf. Respective information: %s.\n", - __func__, wl.to_string()); - } + assert(wl.degree != 0); + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); numRetries--; + if ((!nextAddrGenEvent.scheduled()) && + (!memQueueFull())) { + schedule(nextAddrGenEvent, nextCycle()); + } } retrySpaceAllocated--; - if ((!nextAddrGenEvent.scheduled()) && - (!memReqQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); - } } void @@ -173,7 +161,7 @@ PushEngine::processNextAddrGenEvent() PushPacketInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(MPU, "%s: Current packet information generated by " + DPRINTF(PushEngine, "%s: Current packet information generated by " "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -185,22 +173,22 @@ PushEngine::processNextAddrGenEvent() enqueueMemReq(pkt); if (curr_info.done()) { - DPRINTF(MPU, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); pushReqQueue.pop_front(); - DPRINTF(MPU, "%s: Popped curr_info from pushReqQueue. " + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); if (numRetries > 0) { retrySpaceAllocated++; } if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->respondToPushAlarm(); + peerCoalesceEngine->recvPushRetry(); } } - if (memReqQueueFull()) { + if (memQueueFull()) { if (!pushReqQueue.empty()) { - requestMemAlarm(1); + requestMemRetry(1); } return; } @@ -211,9 +199,10 @@ PushEngine::processNextAddrGenEvent() } void -PushEngine::respondToMemAlarm() +PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); + DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4f388cd7e6..11122067d6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -143,7 +143,7 @@ class PushEngine : public BaseMemEngine protected: virtual int respBuffSize() { return memRespQueue.size(); } - virtual void respondToMemAlarm(); + virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 55a9147ac9..27ba5c40c8 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -139,7 +139,7 @@ WLEngine::processNextReadEvent() DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " "onTheFlyUpdateMap.size: %lu.\n", __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From 86a72bc496be523600caf672cdd24c14ba484603 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 19 Jul 2022 14:33:22 -0700 Subject: [PATCH 112/287] Somewhat fixing the correctness. 
--- src/accl/graph/sega/coalesce_engine.cc | 97 +++++++++++++++++--------- src/accl/graph/sega/push_engine.cc | 3 +- 2 files changed, 65 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66b8e1fad7..274d85a5b1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -117,6 +117,7 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) int CoalesceEngine::getBlockIndex(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); return ((int) (addr / peerMemoryAtomSize)) % numLines; } @@ -124,6 +125,7 @@ CoalesceEngine::getBlockIndex(Addr addr) int CoalesceEngine::getBitIndexBase(Addr addr) { + assert((addr % peerMemoryAtomSize) == 0); int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); int bit_index = atom_index * block_bits; @@ -134,6 +136,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); Addr block_addr = (nmpu * peerMemoryAtomSize) * ((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); return (block_addr + memoryAddressOffset); @@ -336,39 +339,62 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) int it = getBitIndexBase(addr); int block_index = getBlockIndex(addr); - bool line_do_push = false; - if (cacheBlocks[block_index].addr == addr) { + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + // We read the address to send the wl but it is put in cache before + // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { - assert(applyQueue.find(block_index)); - line_do_push = true; + // It is not busy anymore, we have to send the wl from cache. 
+ for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + // TODO: Make this more programmable + uint32_t new_prop = std::min( + cacheBlocks[block_index].items[i].prop, + cacheBlocks[block_index].items[i].tempProp); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], needsPush[it + i]); + needsPush[it + i] = 0; + } + // Since we have just applied the line, we can take it out of + // the applyQueue if it's in there. No need to do the same + // thing for evictQueue. + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + } } else { - line_do_push = false; + // The line is busy. Therefore, we have to disregard the data + // we received from the memory and also tell the push engine to + // deallocate the space it allocated for this retry. However, + // we still have to rememeber that these items need a retry. + // i.e. don't change needsPush, call recvWLItemRetry with + // do_push = false + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (cacheBlocks[block_index].items[i].degree == 0))); + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i], false); + } + } + } else { + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. + WorkListItem* items = pkt->getPtr(); + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + needsPush[it + i] = 0; } - } - // We have to send the items regardless of them being found in the - // cache. 
However, if they are found in the cache, two things should - // happen. First, do_push should be set to false and the bit vector - // value for the items should not change. To future Mahyar and Marjan, - // If this is confusing, please look at where each item is pushed to - // the apply queue. Hint: Think about updates that might not be sent - // out if you reset the bit regardless of the line being found in the - // cache. - WorkListItem* items = pkt->getPtr(); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(items[i], - (line_do_push && needsPush[it + i])); } - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - } + delete pkt; return true; } @@ -488,9 +514,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - if (applyQueue.empty()) { - return; - } + // if (applyQueue.empty()) { + // return; + // } int block_index = applyQueue.front(); @@ -515,10 +541,12 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), cacheBlocks[block_index].items[i].to_string()); - int bit_index = getBitIndexBase(cacheBlocks[block_index].addr) + i; - if (cacheBlocks[block_index].items[i].degree != 0) { + if ((cacheBlocks[block_index].items[i].degree != 0) && + (needsPush[bit_index] == 0)) { + // If the respective bit in the bit vector is set + // there is no need to try and resend it. 
if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( cacheBlocks[block_index].items[i]); @@ -684,6 +712,9 @@ CoalesceEngine::recvPushRetry() } if (applyQueue.find(block_index)) { applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } } } else { // FIXME: Fix the retry mechanism between memory and cache to diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 16e0ca6c6c..044429f8fc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -97,12 +97,11 @@ PushEngine::ReqPort::recvReqRetry() sendPacket(blockedPacket); if (!_blocked) { + blockedPacket = nullptr; DPRINTF(MPU, "%s: Sent the blockedPacket. " "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); - - blockedPacket = nullptr; } } From 9f4c1f31be4bf999b1b525e604999d529f33e41b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 01:31:49 -0700 Subject: [PATCH 113/287] Almost fixed retry bugs. 14 wrong vertices in lj. 
--- configs/accl/sega.py | 2 +- src/accl/graph/base/base_mem_engine.cc | 18 ++--- src/accl/graph/sega/coalesce_engine.cc | 95 ++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 5 ++ src/accl/graph/sega/push_engine.cc | 101 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 4 +- 6 files changed, 170 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 65645b3bb3..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -174,5 +174,5 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print("Simulation finished!") + print(f"Exited simulation because {exit_event.getCause()}") exit() diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index 32c314033d..e05357950b 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -97,12 +97,8 @@ BaseMemEngine::MemPort::recvReqRetry() void BaseMemEngine::processNextMemReqEvent() { - if (memPort.blocked()) { - return; - } - - if (((respBuffSize() + onTheFlyReqs) < respQueueSize) || - (respQueueSize == 0)) { + if ((respQueueSize == 0) || + ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; @@ -120,7 +116,8 @@ BaseMemEngine::processNextMemReqEvent() } } - if ((!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + if ((!memPort.blocked()) && + (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -183,8 +180,7 @@ BaseMemEngine::enqueueMemReq(PacketPtr pkt) panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); outstandingMemReqQueue.push_back(pkt); - assert(!outstandingMemReqQueue.empty()); - if (!nextMemReqEvent.scheduled()) { + if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -202,8 +198,8 @@ 
BaseMemEngine::requestMemRetry(int space) { void BaseMemEngine::wakeUp() { - if ((!nextMemReqEvent.scheduled()) && - (!outstandingMemReqQueue.empty())) { + assert(!nextMemReqEvent.scheduled()); + if (!outstandingMemReqQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 274d85a5b1..dde6e46aa9 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -31,6 +31,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" +#include "debug/CoalesceEngine.hh" #include "debug/MPU.hh" #include "mem/packet_access.hh" @@ -44,11 +45,14 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntry(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + currentBitSliceIndex(0), + numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -344,6 +348,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // We read the address to send the wl but it is put in cache before // the read response arrives. if (cacheBlocks[block_index].busyMask == 0) { + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as idle.\n", + __func__, addr); + int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && @@ -354,10 +362,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -366,6 +379,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // The line is busy. Therefore, we have to disregard the data @@ -374,24 +394,31 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // we still have to rememeber that these items need a retry. // i.e. don't change needsPush, call recvWLItemRetry with // do_push = false - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i], false); - } + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was found in the cache as busy.\n", + __func__, addr); + peerPushEngine->deallocatePushSpace(numElementsPerLine); } } else { // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. 
+ DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); WorkListItem* items = pkt->getPtr(); + int push_needed = 0; // No applying of the line needed. for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); - peerPushEngine->recvWLItemRetry(items[i], needsPush[it + i]); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); } delete pkt; @@ -514,10 +541,6 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextApplyEvent() { - // if (applyQueue.empty()) { - // return; - // } - int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { @@ -665,14 +688,23 @@ CoalesceEngine::processNextEvictEvent() void CoalesceEngine::recvPushRetry() { - DPRINTF(MPU, "%s: Received a Push alarm.\n", __func__); + numRetriesReceived++; + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextSendRetryEvent() +{ + DPRINTF(MPU, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; uint32_t slice = 0; bool hit_in_cache = false; - for (it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -691,14 +723,23 @@ CoalesceEngine::recvPushRetry() break; } } + if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { + it = 0; + } } assert(it < MAX_BITVECTOR_SIZE); + if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { + currentBitSliceIndex = 0; + } else { + currentBitSliceIndex = it + numElementsPerLine; + } - DPRINTF(MPU, "%s: Found slice %u at %d position 
in needsPush.\n", - __func__, slice, it); + DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " + "in needsPush.\n", __func__, slice, it); if (hit_in_cache) { + int push_needed = 0; for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -706,15 +747,26 @@ CoalesceEngine::recvPushRetry() cacheBlocks[block_index].items[i].tempProp); cacheBlocks[block_index].items[i].tempProp = new_prop; cacheBlocks[block_index].items[i].prop = new_prop; - peerPushEngine->recvWLItemRetry(cacheBlocks[block_index].items[i], - (needsPush[it + i] == 1)); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { deschedule(nextApplyEvent); } + if (cacheBlocks[block_index].hasConflict) { + evictQueue.push_back(block_index); + if ((!nextEvictEvent.scheduled()) && + (!pendingMemRetry())) { + schedule(nextEvictEvent, nextCycle()); + } + } } } else { // FIXME: Fix the retry mechanism between memory and cache to @@ -730,6 +782,11 @@ CoalesceEngine::recvPushRetry() requestMemRetry(1); } } + + numRetriesReceived--; + if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { + schedule(nextSendRetryEvent, nextCycle()); + } } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0fa555c84a..e1033a4622 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,6 +96,8 @@ class CoalesceEngine : public BaseMemEngine std::deque> responseQueue; + int currentBitSliceIndex; + int numRetriesReceived; FIFOSet applyQueue; std::bitset needsPush; @@ -114,6 
+116,9 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper nextEvictEvent; void processNextEvictEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 044429f8fc..d493b34c53 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -105,6 +105,35 @@ PushEngine::ReqPort::recvReqRetry() } } +void +PushEngine::deallocatePushSpace(int space) +{ + retrySpaceAllocated -= space; + DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " + "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " + "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", + __func__, space, numRetries, + nextAddrGenEvent.scheduled() ? "true" : "false", + pendingMemRetry() ? "true" : "false", + pushReqQueue.size(), retrySpaceAllocated); + /// DISCUSS: Might have to check whether the addrGenEvent is scheduled + // and or the pushReqQueue is empty. If so we might need to + // send retries. + if ((numRetries > 0) && + ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + int free_space = + pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + if (free_space > numElementsPerLine) { + DPRINTF(PushEngine, "%s: Found %d free spaces. 
" + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + retrySpaceAllocated += numElementsPerLine; + peerCoalesceEngine->recvPushRetry(); + } + } +} + void PushEngine::recvWLItem(WorkListItem wl) { @@ -124,32 +153,41 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - assert(!pushReqQueue.empty()); - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { - schedule(nextAddrGenEvent, nextCycle()); + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { + schedule(nextAddrGenEvent, nextCycle()); + } } } void -PushEngine::recvWLItemRetry(WorkListItem wl, bool do_push) +PushEngine::recvWLItemRetry(WorkListItem wl) { - DPRINTF(PushEngine, "%s: Received %s with do_push = %s.\n", - __func__, wl.to_string(), do_push ? "true" : "false"); - if (do_push) { - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - assert(wl.degree != 0); - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); - numRetries--; - if ((!nextAddrGenEvent.scheduled()) && - (!memQueueFull())) { + assert(wl.degree != 0); + DPRINTF(PushEngine, "%s: Received %s with retry.\n", + __func__, wl.to_string()); + + Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = wl.prop; + + pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, value); + numRetries--; + retrySpaceAllocated--; + if ((!nextAddrGenEvent.scheduled())) { + if (memQueueFull()) { + if (!pendingMemRetry()) { + requestMemRetry(1); + } + } else { schedule(nextAddrGenEvent, nextCycle()); } } - retrySpaceAllocated--; } void @@ -177,11 +215,27 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: 
Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + // retrySpaceAllocated++; + // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " + // "retrySpaceAllocated = %d.\n", + // __func__, retrySpaceAllocated); + // if ((retrySpaceAllocated % numElementsPerLine) == 0) { + // peerCoalesceEngine->recvPushRetry(); + // } + // } if (numRetries > 0) { - retrySpaceAllocated++; - } - if ((retrySpaceAllocated % numElementsPerLine) == 0) { - peerCoalesceEngine->recvPushRetry(); + int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + DPRINTF(PushEngine, "%s: Found %d free spaces in " + "the pushReqQueue.\n", __func__, free_space); + if (free_space > numElementsPerLine) { + retrySpaceAllocated += numElementsPerLine; + DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + "retrySpaceAllocated = %d.\n", __func__, free_space, + retrySpaceAllocated); + peerCoalesceEngine->recvPushRetry(); + } } } @@ -201,7 +255,7 @@ void PushEngine::recvMemRetry() { assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Responding to a memory alarm.\n", __func__); + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); schedule(nextAddrGenEvent, nextCycle()); } @@ -285,6 +339,7 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { + assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { return true; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 11122067d6..9025ae9946 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -155,9 +155,11 @@ class PushEngine : public BaseMemEngine bool allocatePushSpace(); + void deallocatePushSpace(int space); + void recvWLItem(WorkListItem wl); - void 
recvWLItemRetry(WorkListItem wl, bool do_push); + void recvWLItemRetry(WorkListItem wl); void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); From e54f3c1c05a637cea9d8385253edd25fdd7e0b78 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 11:36:14 -0700 Subject: [PATCH 114/287] Deleting comments and updating config. --- configs/accl/sega.py | 14 +++++++------- src/accl/graph/sega/push_engine.cc | 14 ++------------ 2 files changed, 9 insertions(+), 19 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..15431088d2 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=64, + resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, + num_mshr_entry=32, + num_tgts_per_mshr=4, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + on_the_fly_update_map_size=16) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d493b34c53..e87f4d275e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,16 +215,6 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - // retrySpaceAllocated++; - // DPRINTF(PushEngine, "%s: Allocated 1 space for retry. " - // "retrySpaceAllocated = %d.\n", - // __func__, retrySpaceAllocated); - // if ((retrySpaceAllocated % numElementsPerLine) == 0) { - // peerCoalesceEngine->recvPushRetry(); - // } - // } if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); DPRINTF(PushEngine, "%s: Found %d free spaces in " @@ -232,8 +222,8 @@ PushEngine::processNextAddrGenEvent() if (free_space > numElementsPerLine) { retrySpaceAllocated += numElementsPerLine; DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); + "retrySpaceAllocated = %d.\n", __func__, + numElementsPerLine, retrySpaceAllocated); peerCoalesceEngine->recvPushRetry(); } } From 5a27472b412574e5f3d02f2be34af319c9e70296 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 14:12:33 -0700 Subject: [PATCH 115/287] Adding a new debug print. --- src/accl/graph/sega/coalesce_engine.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dde6e46aa9..e7e528aaf5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -353,6 +353,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); int push_needed = 0; // It is not busy anymore, we have to send the wl from cache. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -369,6 +371,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); // Since we have just applied the line, we can take it out of @@ -397,7 +401,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received read response for retry " "for addr %lu. It was found in the cache as busy.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); } } else { // We have read the address to send the wl and it is not in the @@ -408,6 +416,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -417,6 +427,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); } @@ -740,6 +752,8 @@ CoalesceEngine::processNextSendRetryEvent() if (hit_in_cache) { int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -754,6 +768,8 @@ CoalesceEngine::processNextSendRetryEvent() push_needed += needsPush[it + i]; needsPush[it + i] = 0; } + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); From 590c8a8870a475383faf26890c014a85bd9068ec Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 15:51:58 -0700 Subject: [PATCH 116/287] Updating debug flags. Adding one per comp. 
--- configs/accl/sega.py | 14 ++-- src/accl/graph/SConscript | 4 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/base_mem_engine.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 91 +++++++++++++------------- src/accl/graph/sega/push_engine.cc | 9 ++- src/accl/graph/sega/wl_engine.cc | 44 ++++++------- 7 files changed, 82 insertions(+), 87 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 15431088d2..eb209911be 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=32, + push_req_queue_size=4, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=64, - resp_queue_size=64) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=32, - num_tgts_per_mshr=4, + num_mshr_entry=1, + num_tgts_per_mshr=1, outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - on_the_fly_update_map_size=16) + update_queue_size=1, + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7ca60c30bd..f5f7e962af 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') -DebugFlag('MPU') -# CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine']) + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 4c90dfa9a6..45877a12ca 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -32,3 +32,4 @@ SimObject('BaseReduceEngine.py') Source('base_mem_engine.cc') 
Source('base_reduce_engine.cc') +DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index e05357950b..cb4c1d81bb 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/base/base_mem_engine.hh" -#include "debug/MPU.hh" +#include "debug/BaseMemEngine.hh" namespace gem5 { @@ -102,7 +102,7 @@ BaseMemEngine::processNextMemReqEvent() PacketPtr pkt = outstandingMemReqQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; - DPRINTF(MPU, "%s: Sent a packet to memory with the following info. " + DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); outstandingMemReqQueue.pop_front(); @@ -190,7 +190,7 @@ BaseMemEngine::requestMemRetry(int space) { panic_if((memRetryRequested == true) || (memSpaceRequested != 0), "You should not request another alarm without the first one being" "responded to.\n"); - DPRINTF(MPU, "%s: Alarm requested with space = %d.\n", __func__, space); + DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); memRetryRequested = true; memSpaceRequested = space; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e7e528aaf5..522feebace 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" namespace gem5 @@ -150,7 +149,7 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHRMap.size() <= numMSHREntry); - DPRINTF(MPU, "%s: Received a read request for address: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * 
peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); @@ -167,7 +166,7 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(MPU, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string(), @@ -184,28 +183,28 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - DPRINTF(MPU, "%s: Addr: %lu is a miss.\n", __func__, addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu not " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); assert(MSHRMap.size() <= numMSHREntry); if (MSHRMap.size() == numMSHREntry) { // Out of MSHR entries - DPRINTF(MPU, "%s: Out of MSHR entries. " + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(MPU, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -213,13 +212,13 @@ CoalesceEngine::recvWLRead(Addr addr) } cacheBlocks[block_index].hasConflict = true; MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; if (!cacheBlocks[block_index].busyMask) { applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. " + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); assert(!applyQueue.empty()); @@ -231,11 +230,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(MPU, "%s: Addr: %lu has no conflict. Trying to " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); if (memQueueFull()) { - DPRINTF(MPU, "%s: No space in outstandingMemReqQueue. " + DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " "Rejecting request.\n", __func__); stats.readRejections++; return false; @@ -245,19 +244,19 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(MPU, "%s: Allocated cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " "Addr: %lu.\n", __func__, block_index, addr); MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." 
+ DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." " req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); enqueueMemReq(pkt); - DPRINTF(MPU, "%s: Pushed pkt to outstandingMemReqQueue.\n", + DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", __func__); stats.readMisses++; stats.numVertexReads++; @@ -265,10 +264,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(MPU, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(MPU, "%s: Out of targets for cache line[%d]. " + DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -276,7 +275,7 @@ CoalesceEngine::recvWLRead(Addr addr) } if ((!cacheBlocks[block_index].hasConflict) && (aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(MPU, "%s: Addr: %lu has a conflict " + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; @@ -289,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHRMap[block_index].push_back(addr); - DPRINTF(MPU, "%s: Added Addr: %lu to targets for cache " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; @@ -306,11 +305,11 @@ CoalesceEngine::processNextRespondEvent() std::tie(addr_response, worklist_response) = responseQueue.front(); peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(MPU, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", + DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); 
responseQueue.pop_front(); - DPRINTF(MPU, "%s: Popped a response from responseQueue. " + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, responseQueue.size()); @@ -333,7 +332,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); if (pkt->isWrite()) { delete pkt; - DPRINTF(MPU, "%s: Received a write response for Addr: %lu. Dropping " + DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " "the packet.\n", __func__, pkt->getAddr()); return true; } @@ -440,7 +439,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = (addr / peerMemoryAtomSize) % numLines; - DPRINTF(MPU, "%s: Received a read resposne for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false @@ -449,7 +448,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(MPU, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, block_index, i, cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; @@ -462,13 +461,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - 
DPRINTF(MPU, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -477,7 +476,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(MPU, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " "removal.\n", __func__, i, block_index); } } @@ -490,7 +489,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHRMap[block_index].erase(MSHRMap[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(MPU, "%s: Addr: %lu has been serviced and is removed.\n", + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } @@ -517,7 +516,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(MPU, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", __func__, wl.to_string(), addr); assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); @@ -529,17 +528,17 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(MPU, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(MPU, "%s: Received all the expected writes for cache line[%d]." 
+ DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. applyQueue.size = %u.\n", __func__, block_index, applyQueue.size()); } @@ -556,12 +555,12 @@ CoalesceEngine::processNextApplyEvent() int block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid apply process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -596,7 +595,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ evictQueue.push_back(block_index); - DPRINTF(MPU, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", + DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", __func__, block_index, evictQueue.size()); } @@ -621,7 +620,7 @@ CoalesceEngine::processNextEvictEvent() if ((cacheBlocks[block_index].busyMask) || (applyQueue.find(block_index))) { - DPRINTF(MPU, "%s: cache line [%d] has been taken amid evict process. " + DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseEvictSchedules++; @@ -630,7 +629,7 @@ CoalesceEngine::processNextEvictEvent() (cacheBlocks[block_index].hasConflict ? 2 : 1) : (cacheBlocks[block_index].hasConflict ? 
1 : 0); if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(MPU, "%s: There is not enough space in memReqQueue to " + DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " "procees the eviction of cache line [%d]. dirty: %d, " "hasConflict: %d.\n", __func__, block_index, cacheBlocks[block_index].dirty, @@ -639,12 +638,12 @@ CoalesceEngine::processNextEvictEvent() return; } else { if (cacheBlocks[block_index].dirty) { - DPRINTF(MPU, "%s: Change observed on cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", __func__, block_index); PacketPtr write_pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(MPU, "%s: Created a write packet to Addr: %lu, " + DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " "size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); enqueueMemReq(write_pkt); @@ -653,7 +652,7 @@ CoalesceEngine::processNextEvictEvent() if (cacheBlocks[block_index].hasConflict) { assert(!MSHRMap[block_index].empty()); Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(MPU, "%s: First conflicting address for cache line[%d]" + DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" " is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = @@ -661,7 +660,7 @@ CoalesceEngine::processNextEvictEvent() PacketPtr read_pkt = createReadPacket(aligned_miss_addr, peerMemoryAtomSize); - DPRINTF(MPU, "%s: Created a read packet for Addr: %lu." + DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, miss_addr, read_pkt->getAddr(), read_pkt->getSize()); @@ -673,7 +672,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = true; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Allocated cache line [%d] for Addr: %lu.\n", + DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", __func__, block_index, aligned_miss_addr); } else { @@ -683,7 +682,7 @@ CoalesceEngine::processNextEvictEvent() cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; cacheBlocks[block_index].dirty = false; - DPRINTF(MPU, "%s: Deallocated cache line [%d].\n", + DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", __func__, block_index); } } @@ -709,7 +708,7 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { - DPRINTF(MPU, "%s: Received a push retry.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; int it = 0; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index e87f4d275e..f17619942b 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" -#include "debug/MPU.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -91,14 +90,14 @@ PushEngine::ReqPort::recvReqRetry() { panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - DPRINTF(MPU, "%s: Received a reqRetry.\n", __func__); + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; sendPacket(blockedPacket); if (!_blocked) { blockedPacket = nullptr; - DPRINTF(MPU, "%s: Sent the blockedPacket. " + DPRINTF(PushEngine, "%s: Sent the blockedPacket. 
" "_blocked: %s, (blockedPacket == nullptr): %s.\n", __func__, _blocked ? "true" : "false", (blockedPacket == nullptr) ? "true" : "false"); @@ -273,7 +272,7 @@ PushEngine::processNextPushEvent() assert(offset < peerMemoryAtomSize); uint32_t value = reqValueMap[pkt->req]; - DPRINTF(MPU, "%s: Looking at the front of the queue. pkt->Addr: %lu, " + DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " "offset: %lu\n", __func__, pkt->getAddr(), offset); @@ -287,7 +286,7 @@ PushEngine::processNextPushEvent() if (!reqPort.blocked()) { reqPort.sendPacket(update); stats.numUpdates++; - DPRINTF(MPU, "%s: Sent a push update to addr: %lu with value: %d.\n", + DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", __func__, curr_edge->neighbor, update_value); reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 27ba5c40c8..9d4fb9cbe9 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,7 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" -#include "debug/MPU.hh" +#include "debug/WLEngine.hh" #include "mem/packet_access.hh" namespace gem5 @@ -73,7 +73,7 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - DPRINTF(MPU, "%s: Sending a RetryReq.\n", __func__); + DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); sendRetryReq(); needSendRetryReq = false; } @@ -129,45 +129,38 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(MPU, "%s: Looking at the front of the updateQueue. Addr: %lu, " + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
Addr: %lu, " "value: %u.\n", __func__, update_addr, update_value); if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(MPU, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", + DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - DPRINTF(MPU, "%s: Entry available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", - __func__, onTheFlyUpdateMap.size()); - if (coalesceEngine->recvWLRead(update_addr)) { + if (coalesceEngine->recvReadAddr(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(MPU, "%s: Added a new item to onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); } - } else { - DPRINTF(MPU, "%s: No entries available in onTheFlyUpdateMap. " - "onTheFlyUpdateMap.size: %lu.\n", __func__, - onTheFlyUpdateMap.size()); } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(MPU, "%s: Found the addr: %lu in onTheFlyUpdateMap. " + DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, update_addr, onTheFlyUpdateMap[update_addr]); onTheFlyUpdateMap[update_addr] = std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(MPU, "%s: Reduced the update_value with the entry in " + DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " "onTheFlyUpdateMap. 
onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, onTheFlyUpdateMap[update_addr]); stats.onTheFlyCoalesce++; updateQueue.pop_front(); - DPRINTF(MPU, "%s: Popped an item from the front of updateQueue" + DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); respPort.checkRetryReq(); @@ -185,7 +178,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); addrWorkListMap[addr] = wl; - DPRINTF(MPU, "%s: Received a WorkListItem from the coalesceEngine. Adding" + DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", __func__, addr, wl.to_string()); @@ -202,7 +195,7 @@ WLEngine::processNextReduceEvent() Addr addr = it.first; assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(MPU, "%s: Reducing between onTheFlyUpdateMap and " + DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " "addrWorkListMap values. onTheFlyUpdateMap[%lu] = %u, " "addrWorkListMap[%lu] = %s.\n", __func__, addr, onTheFlyUpdateMap[addr], @@ -210,15 +203,14 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min addrWorkListMap[addr].tempProp = std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(MPU, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", + DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", __func__, addr, addrWorkListMap[addr].to_string()); stats.numReduce++; coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); onTheFlyUpdateMap.erase(addr); - DPRINTF(MPU, "%s: Erased addr: %lu from onTheFlyUpdateMap. 
" - "onTheFlyUpdateMap.size: %lu.\n", - __func__, addr, onTheFlyUpdateMap.size()); + DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", + __func__, addr); } addrWorkListMap.clear(); } @@ -231,8 +223,12 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } + if (curTick() == ) { + std + } + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(MPU, "%s: Pushed an item to the back of updateQueue" + DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", __func__, updateQueue.size()); delete pkt; From be1246d97085c07ab86fc888111b9cdb8b6b30ea Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 20 Jul 2022 16:11:12 -0700 Subject: [PATCH 117/287] Removing accidentally commented out wrong code. --- src/accl/graph/sega/wl_engine.cc | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9d4fb9cbe9..70a921c48a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -223,10 +223,6 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - if (curTick() == ) { - std - } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" ". updateQueue.size = %u.\n", From c9458f184ad39f8f147bb18a9f3e29f2ecb90ec1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 14:23:35 -0700 Subject: [PATCH 118/287] Adding in between counter for retry. 
--- src/accl/graph/sega/push_engine.cc | 59 +++++++++++++++++++++--------- src/accl/graph/sega/push_engine.hh | 5 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 3 files changed, 45 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index f17619942b..0c2b3deb3f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -37,11 +37,10 @@ namespace gem5 PushEngine::PushEngine(const PushEngineParams ¶ms): BaseMemEngine(params), - retrySpaceAllocated(0), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), - numRetries(0), pushReqQueueSize(params.push_req_queue_size), + numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -118,16 +117,28 @@ PushEngine::deallocatePushSpace(int space) /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - if ((numRetries > 0) && - ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // if ((numRetries > 0) && + // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { + // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); + // int free_space = + // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // if (free_space > numElementsPerLine) { + // DPRINTF(PushEngine, "%s: Found %d free spaces. " + // "retrySpaceAllocated = %d.\n", __func__, free_space, + // retrySpaceAllocated); + // retrySpaceAllocated += numElementsPerLine; + // peerCoalesceEngine->recvPushRetry(); + // } + // } + + if (numRetries > 0) { int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - if (free_space > numElementsPerLine) { - DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - "retrySpaceAllocated = %d.\n", __func__, free_space, - retrySpaceAllocated); - retrySpaceAllocated += numElementsPerLine; + assert(free_space <= numElementsPerLine); + retrySpaceAllocated += free_space; + spacesAllocatedBetweenRetries += free_space; + if (spacesAllocatedBetweenRetries >= numElementsPerLine) { + spacesAllocatedBetweenRetries %= numElementsPerLine; peerCoalesceEngine->recvPushRetry(); } } @@ -214,15 +225,26 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); + // if (numRetries > 0) { + // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); + // DPRINTF(PushEngine, "%s: Found %d free spaces in " + // "the pushReqQueue.\n", __func__, free_space); + // if (free_space > numElementsPerLine) { + // retrySpaceAllocated += numElementsPerLine; + // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " + // "retrySpaceAllocated = %d.\n", __func__, + // numElementsPerLine, retrySpaceAllocated); + // peerCoalesceEngine->recvPushRetry(); + // } + // } + if (numRetries > 0) { - int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - DPRINTF(PushEngine, "%s: Found %d free spaces in " - "the pushReqQueue.\n", __func__, free_space); - if (free_space > numElementsPerLine) { - retrySpaceAllocated += numElementsPerLine; - DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - "retrySpaceAllocated = %d.\n", __func__, - numElementsPerLine, retrySpaceAllocated); + retrySpaceAllocated++; + DPRINTF(PushEngine, "%s: Allocated one space for retry. 
" + "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); + spacesAllocatedBetweenRetries++; + if (spacesAllocatedBetweenRetries == numElementsPerLine) { + spacesAllocatedBetweenRetries = 0; peerCoalesceEngine->recvPushRetry(); } } @@ -331,6 +353,7 @@ PushEngine::allocatePushSpace() { assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { + assert(numRetries == 0); return true; } else { numRetries++; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9025ae9946..cd79139bbc 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -99,15 +99,16 @@ class PushEngine : public BaseMemEngine }; int numElementsPerLine; - int retrySpaceAllocated; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; Addr baseEdgeAddr; - int numRetries; int pushReqQueueSize; + int numRetries; + int retrySpaceAllocated; + int spacesAllocatedBetweenRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 70a921c48a..79bf046ba3 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -136,7 +136,7 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", __func__, update_addr); if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { - if (coalesceEngine->recvReadAddr(update_addr)) { + if (coalesceEngine->recvWLRead(update_addr)) { onTheFlyUpdateMap[update_addr] = update_value; DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " "onTheFlyUpdateMap[%lu] = %u.\n", __func__, From cb3169882f5dd404f87f533f104d1fa346da30f1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 21 Jul 2022 23:24:32 -0700 Subject: [PATCH 119/287] Fixing the retry mechanism. 
--- src/accl/graph/sega/coalesce_engine.cc | 21 ++++-- src/accl/graph/sega/push_engine.cc | 89 +++++++++----------------- src/accl/graph/sega/push_engine.hh | 9 ++- 3 files changed, 55 insertions(+), 64 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 522feebace..b3167a0e95 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -354,6 +354,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // It is not busy anymore, we have to send the wl from cache. DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (cacheBlocks[block_index].items[i].degree == 0))); @@ -374,6 +375,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); // Since we have just applied the line, we can take it out of // the applyQueue if it's in there. No need to do the same // thing for evictQueue. @@ -402,7 +404,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, addr); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine); + assert(peerPushEngine->getNumRetries() == needsPush.count()); DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); } @@ -417,6 +421,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // No applying of the line needed. 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { assert(!((needsPush[it + i] == 1) && (items[i].degree == 0))); @@ -430,6 +435,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); } delete pkt; @@ -708,6 +714,13 @@ CoalesceEngine::recvPushRetry() void CoalesceEngine::processNextSendRetryEvent() { + if (needsPush.count() == 0) { + DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + "bit in needsPush. Rejecting the retry.\n", __func__); + peerPushEngine->recvRetryReject(); + return; + } + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; int block_index = 0; @@ -715,7 +728,8 @@ CoalesceEngine::processNextSendRetryEvent() uint32_t slice = 0; bool hit_in_cache = false; - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { for (int i = 0; i < numElementsPerLine; i++) { slice <<= 1; slice |= needsPush[it + i]; @@ -734,9 +748,6 @@ CoalesceEngine::processNextSendRetryEvent() break; } } - if (it == (MAX_BITVECTOR_SIZE - numElementsPerLine)) { - it = 0; - } } assert(it < MAX_BITVECTOR_SIZE); @@ -753,6 +764,7 @@ CoalesceEngine::processNextSendRetryEvent() int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { // TODO: Make this more programmable uint32_t new_prop = std::min( @@ -770,6 +782,7 @@ CoalesceEngine::processNextSendRetryEvent() DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, 
needsPush.count()); peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); if (applyQueue.find(block_index)) { applyQueue.erase(block_index); if (applyQueue.empty() && nextApplyEvent.scheduled()) { diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0c2b3deb3f..6db91734fe 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -40,7 +40,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), - numRetries(0), retrySpaceAllocated(0), spacesAllocatedBetweenRetries(0), + numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), stats(*this) @@ -106,39 +106,22 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::deallocatePushSpace(int space) { - retrySpaceAllocated -= space; - DPRINTF(PushEngine, "%s: Deallocated %d spaces. numRetries = %d, " - "nextAddrGenEvent.scheduled() = %s, pendingMemRetry() = %s, " - "pushReqQueue.size() = %d, retrySpaceAllocated = %d.\n", - __func__, space, numRetries, - nextAddrGenEvent.scheduled() ? "true" : "false", - pendingMemRetry() ? "true" : "false", - pushReqQueue.size(), retrySpaceAllocated); /// DISCUSS: Might have to check whether the addrGenEvent is scheduled // and or the pushReqQueue is empty. If so we might need to // send retries. - // if ((numRetries > 0) && - // ((pushReqQueue.size() + retrySpaceAllocated) == 0)) { - // assert((!pendingMemRetry()) && (!nextAddrGenEvent.scheduled())); - // int free_space = - // pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // if (free_space > numElementsPerLine) { - // DPRINTF(PushEngine, "%s: Found %d free spaces. 
" - // "retrySpaceAllocated = %d.\n", __func__, free_space, - // retrySpaceAllocated); - // retrySpaceAllocated += numElementsPerLine; - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - int free_space = - pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - assert(free_space <= numElementsPerLine); - retrySpaceAllocated += free_space; - spacesAllocatedBetweenRetries += free_space; - if (spacesAllocatedBetweenRetries >= numElementsPerLine) { - spacesAllocatedBetweenRetries %= numElementsPerLine; + DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", + __func__, space); + numPendingRetries--; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -162,6 +145,8 @@ PushEngine::recvWLItem(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { @@ -187,8 +172,10 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); - numRetries--; - retrySpaceAllocated--; + DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", + __func__, pushReqQueue.size()); + + numTotalRetries--; if ((!nextAddrGenEvent.scheduled())) { if (memQueueFull()) { if (!pendingMemRetry()) { @@ -225,26 +212,16 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" "pushReqQueue.size() = %u.\n", __func__, pushReqQueue.size()); - // if (numRetries > 0) { - // int free_space = pushReqQueueSize - (pushReqQueue.size() + retrySpaceAllocated); - // DPRINTF(PushEngine, "%s: Found %d free spaces in " - // "the pushReqQueue.\n", __func__, free_space); - // if (free_space > numElementsPerLine) { - // retrySpaceAllocated += numElementsPerLine; - // DPRINTF(PushEngine, "%s: Allocated %d spaces for retry. " - // "retrySpaceAllocated = %d.\n", __func__, - // numElementsPerLine, retrySpaceAllocated); - // peerCoalesceEngine->recvPushRetry(); - // } - // } - - if (numRetries > 0) { - retrySpaceAllocated++; - DPRINTF(PushEngine, "%s: Allocated one space for retry. " - "retrySpaceAllocated = %d.\n", __func__, retrySpaceAllocated); - spacesAllocatedBetweenRetries++; - if (spacesAllocatedBetweenRetries == numElementsPerLine) { - spacesAllocatedBetweenRetries = 0; + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + "free spaces.\n", __func__, free_space); + if ((free_space > numElementsPerLine) && + (numTotalRetries >= numPendingRetries)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + numPendingRetries++; peerCoalesceEngine->recvPushRetry(); } } @@ -350,13 +327,11 @@ PushEngine::createUpdatePacket(Addr addr, T value) bool PushEngine::allocatePushSpace() { - assert(retrySpaceAllocated >= 0); if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() + retrySpaceAllocated) < pushReqQueueSize)) { - assert(numRetries == 0); + ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { return true; } else { - numRetries++; + numTotalRetries++; return false; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cd79139bbc..a3a308554f 100644 --- a/src/accl/graph/sega/push_engine.hh +++ 
b/src/accl/graph/sega/push_engine.hh @@ -106,9 +106,8 @@ class PushEngine : public BaseMemEngine Addr baseEdgeAddr; int pushReqQueueSize; - int numRetries; - int retrySpaceAllocated; - int spacesAllocatedBetweenRetries; + int numTotalRetries; + int numPendingRetries; std::deque pushReqQueue; // TODO: Add size one size for all these maps @@ -164,6 +163,10 @@ class PushEngine : public BaseMemEngine void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); + + int getNumRetries() { return numTotalRetries; } + + void recvRetryReject() { numPendingRetries--; } }; } From c03a23a38717d7dd123bb92b0a55bb048e53545f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 15:59:31 -0700 Subject: [PATCH 120/287] Limiting retries to one. --- src/accl/graph/sega/push_engine.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6db91734fe..ab2962b253 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -118,7 +118,7 @@ PushEngine::deallocatePushSpace(int space) DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; @@ -218,7 +218,7 @@ PushEngine::processNextAddrGenEvent() DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); if ((free_space > numElementsPerLine) && - (numTotalRetries >= numPendingRetries)) { + (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); numPendingRetries++; From dcfaab330d517c1b02c8aaa882336698d1a29de6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 17:28:51 -0700 Subject: [PATCH 121/287] Adding MemoryEvent 
class and nextReadOnMissEvent. --- src/accl/graph/sega/coalesce_engine.cc | 42 +++++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 21 +++++++++++-- 2 files changed, 54 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b3167a0e95..033c1f3363 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,6 +48,7 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), evictQueue(numLines), + nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextEvictEvent([this] { processNextEvictEvent(); }, name()), @@ -175,7 +176,6 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - assert(!responseQueue.empty()); if (!nextRespondEvent.scheduled()) { schedule(nextRespondEvent, nextCycle()); } @@ -233,9 +233,9 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " "allocate a cache line for it.\n", __func__, addr); - if (memQueueFull()) { - DPRINTF(CoalesceEngine, "%s: No space in outstandingMemReqQueue. " - "Rejecting request.\n", __func__); + if (lineFillBuffer.size() == numMSHREntry) { + DPRINTF(CoalesceEngine, "%s: No space left in " + "lineFillBuffer. Rejecting request.\n", __func__); stats.readRejections++; return false; } @@ -255,9 +255,15 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." 
" req addr (aligned_addr) = %lu, size = %d.\n", __func__, addr, aligned_addr, peerMemoryAtomSize); - enqueueMemReq(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to outstandingMemReqQueue.\n", - __func__); + // enqueueMemReq(pkt); + lineFillBuffer.push_back(pkt); + DPRINTF(CoalesceEngine, "%s: Pushed pkt to " + "lineFillBuffer. lineFillBuffer.size = %d.\n", + __func__, lineFillBuffer.size()); + if ((!nextReadOnMissEvent.pending()) && + (!nextReadOnMissEvent.scheduled())) { + schedule(nextReadOnMissEvent, nextCycle()); + } stats.readMisses++; stats.numVertexReads++; return true; @@ -296,6 +302,28 @@ CoalesceEngine::recvWLRead(Addr addr) } } +void +CoalesceEngine::processNextReadOnMissEvent() +{ + if (memQueueFull()) { + nextReadOnMissEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } + + PacketPtr pkt = lineFillBuffer.front(); + enqueueMemReq(pkt); + + lineFillBuffer.pop_front(); + + if (!lineFillBuffer.empty()) { + assert(!nextReadOnMissEvent.scheduled()); + assert(!nextReadOnMissEvent.pending()); + schedule(nextReadOnMissEvent, nextCycle()); + } +} + // TODO: For loop to empty the entire responseQueue. 
void CoalesceEngine::processNextRespondEvent() diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e1033a4622..05fa555ec8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -49,6 +49,20 @@ class WLEngine; class CoalesceEngine : public BaseMemEngine { private: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + struct Block { WorkListItem* items; @@ -93,7 +107,7 @@ class CoalesceEngine : public BaseMemEngine int numMSHREntry; int numTgtsPerMSHR; std::unordered_map> MSHRMap; - + std::deque lineFillBuffer; std::deque> responseQueue; int currentBitSliceIndex; @@ -107,13 +121,16 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + MemoryEvent nextReadOnMissEvent; + void processNextReadOnMissEvent(); + EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - EventFunctionWrapper nextEvictEvent; + MemoryEvent nextEvictEvent; void processNextEvictEvent(); EventFunctionWrapper nextSendRetryEvent; From 7db47e2a89611412310f3f50e32df6433a429af4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 22:04:08 -0700 Subject: [PATCH 122/287] Restructuring events and adding nextWriteBackEvent. 
--- src/accl/graph/base/data_structs.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 290 ++++++++++++------------- src/accl/graph/sega/coalesce_engine.hh | 21 +- src/accl/graph/sega/push_engine.cc | 4 +- 4 files changed, 153 insertions(+), 166 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f938be72f1..f178d5a7e2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,13 +90,13 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class FIFOSet +class InOutSet { private: std::unordered_set set; public: - FIFOSet(int cap) + InOutSet(int cap) { set.reserve(cap); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 033c1f3363..ddbd22a8b5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -42,16 +42,17 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntry(params.num_mshr_entry), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), currentBitSliceIndex(0), numRetriesReceived(0), applyQueue(numLines), - evictQueue(numLines), - nextReadOnMissEvent([this] { processNextReadOnMissEvent(); }, name()), + writeBackQueue(numLines), + replaceQueue(numLines), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextEvictEvent([this] { processNextEvictEvent(); }, name()), + nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) { @@ -149,7 +150,7 @@ 
CoalesceEngine::getBlockAddrFromBitIndex(int index) bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHRMap.size() <= numMSHREntry); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; @@ -184,11 +185,11 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHRMap.find(block_index) == MSHRMap.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu not " + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " "found in MSHRs.\n", __func__, block_index, addr); - assert(MSHRMap.size() <= numMSHREntry); - if (MSHRMap.size() == numMSHREntry) { + assert(MSHR.size() <= numMSHREntries); + if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " "Rejecting request.\n", __func__); @@ -199,24 +200,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { - assert(MSHRMap[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. 
" "Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if (!cacheBlocks[block_index].busyMask) { + + if ((cacheBlocks[block_index].busyMask == 0) && + (cacheBlocks[block_index].valid)) { applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " "applyQueue.size = %u.\n", __func__, @@ -230,39 +233,31 @@ CoalesceEngine::recvWLRead(Addr addr) } else { assert(!cacheBlocks[block_index].valid); // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. Trying to " - "allocate a cache line for it.\n", - __func__, addr); - if (lineFillBuffer.size() == numMSHREntry) { - DPRINTF(CoalesceEngine, "%s: No space left in " - "lineFillBuffer. Rejecting request.\n", __func__); - stats.readRejections++; - return false; - } + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. 
" + "Allocating a cache line for it.\n" + , __func__, addr); + cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line[%d] for " - "Addr: %lu.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); - MSHRMap[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, addr, aligned_addr, peerMemoryAtomSize); // enqueueMemReq(pkt); - lineFillBuffer.push_back(pkt); - DPRINTF(CoalesceEngine, "%s: Pushed pkt to " - "lineFillBuffer. lineFillBuffer.size = %d.\n", - __func__, lineFillBuffer.size()); - if ((!nextReadOnMissEvent.pending()) && - (!nextReadOnMissEvent.scheduled())) { - schedule(nextReadOnMissEvent, nextCycle()); + fillQueue.push_back(block_index); + // FIXME: Fix this DPRINTF + // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " + // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", + // __func__, fillQueue.size()); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -270,10 +265,10 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cache line[%d] for Addr: %lu already " + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " "in MSHRs.\n", __func__, block_index, addr); - if (MSHRMap[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cache line[%d]. " + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " "Rejecting request.\n", __func__, block_index); stats.readRejections++; @@ -293,7 +288,7 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readHitUnderMisses++; } - MSHRMap[block_index].push_back(addr); + MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " "line[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -303,24 +298,29 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextReadOnMissEvent() +CoalesceEngine::processNextMemoryReadEvent() { if (memQueueFull()) { - nextReadOnMissEvent.sleep(); + nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. return; } - PacketPtr pkt = lineFillBuffer.front(); + int block_index = fillQueue.front(); + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + enqueueMemReq(pkt); - lineFillBuffer.pop_front(); + fillQueue.pop_front(); - if (!lineFillBuffer.empty()) { - assert(!nextReadOnMissEvent.scheduled()); - assert(!nextReadOnMissEvent.pending()); - schedule(nextReadOnMissEvent, nextCycle()); + if (!fillQueue.empty()) { + assert(!nextMemoryReadEvent.scheduled()); + assert(!nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -347,11 +347,13 @@ CoalesceEngine::processNextRespondEvent() } } +// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!nextEvictEvent.scheduled()); - schedule(nextEvictEvent, nextCycle()); + // assert(!nextEvictEvent.scheduled()); + // schedule(nextEvictEvent, nextCycle()); + return; } bool @@ -413,10 +415,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -477,7 +479,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) __func__, pkt->getAddr()); assert((cacheBlocks[block_index].allocated) && // allocated cache block (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHRMap.find(block_index) == MSHRMap.end()))); // allocated MSHR + (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -490,18 +492,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; - for (int i = 0; i < MSHRMap[block_index].size(); i++) { - Addr miss_addr = MSHRMap[block_index][i]; + for (int 
i = 0; i < MSHR[block_index].size(); i++) { + Addr miss_addr = MSHR[block_index][i]; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cache line[%d] could " + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " "be serviced with the received packet.\n", __func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cache line[%d][%d] to " + DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " "responseQueue. responseQueue.size = %u.\n" , __func__, block_index, wl_offset, responseQueue.size()); @@ -510,25 +512,25 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // End of the said block servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cache line[%d] for " + DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " "removal.\n", __func__, i, block_index); } } // TODO: We Can use taken instead of this - // TODO: Change the MSHRMap from map to map + // TODO: Change the MSHR from map to map int bias = 0; for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHRMap[block_index][i - bias]; - MSHRMap[block_index].erase(MSHRMap[block_index].begin() + + Addr print_addr = MSHR[block_index][i - bias]; + MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", __func__, print_addr); } - if (MSHRMap[block_index].empty()) { - MSHRMap.erase(block_index); + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); cacheBlocks[block_index].hasConflict = false; } else { assert(cacheBlocks[block_index].hasConflict); @@ -562,13 +564,13 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - DPRINTF(CoalesceEngine, "%s: Wrote to cache line[%d][%d] = %s.\n", + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cache line[%d]." + DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." " It does not have any taken items anymore.\n", __func__, block_index); applyQueue.push_back(block_index); @@ -588,13 +590,13 @@ CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); - if (cacheBlocks[block_index].busyMask) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid apply process. " + if (cacheBlocks[block_index].busyMask != 0) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " "Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has no change. Therefore, no apply " + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " "needed.\n", __func__, block_index); } else { for (int i = 0; i < numElementsPerLine; i++) { @@ -628,17 +630,17 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ - evictQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to evictQueue. evictQueue.size = %u.\n", - __func__, block_index, evictQueue.size()); + writeBackQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
writeBackQueue.size = %u.\n", + __func__, block_index, writeBackQueue.size()); } applyQueue.pop_front(); - if ((!evictQueue.empty()) && - (!pendingMemRetry()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } if ((!applyQueue.empty()) && @@ -648,85 +650,64 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextEvictEvent() +CoalesceEngine::processNextWriteBackEvent() { - int block_index = evictQueue.front(); + if (memQueueFull()) { + nextWriteBackEvent.sleep(); + // TODO: Implement interface where events of the CoalesceEngine are + // pushed to a fifo to be scheduled later. + return; + } - if ((cacheBlocks[block_index].busyMask) || + int block_index = writeBackQueue.front(); + + // Why would we write it back if it does not have a conflict? + assert(cacheBlocks[block_index].hasConflict); + + if ((cacheBlocks[block_index].busyMask != 0) || (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cache line [%d] has been taken amid evict process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "writeback process. Therefore, ignoring the apply schedule.\n", __func__, block_index); + // FIXME: Fix the name of this stat. stats.falseEvictSchedules++; } else { - int space_needed = cacheBlocks[block_index].dirty ? - (cacheBlocks[block_index].hasConflict ? 2 : 1) : - (cacheBlocks[block_index].hasConflict ? 1 : 0); - if (!allocateMemQueueSpace(space_needed)) { - DPRINTF(CoalesceEngine, "%s: There is not enough space in memReqQueue to " - "procees the eviction of cache line [%d]. 
dirty: %d, " - "hasConflict: %d.\n", __func__, block_index, - cacheBlocks[block_index].dirty, - cacheBlocks[block_index].hasConflict); - requestMemRetry(space_needed); - return; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on cache line [%d].\n", - __func__, block_index); - PacketPtr write_pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to Addr: %lu, " - "size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); - } - - if (cacheBlocks[block_index].hasConflict) { - assert(!MSHRMap[block_index].empty()); - Addr miss_addr = MSHRMap[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for cache line[%d]" - " is Addr: %lu.\n", __func__, block_index, miss_addr); - - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - PacketPtr read_pkt = createReadPacket(aligned_miss_addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet for Addr: %lu." - " req addr (aligned_addr) = %lu, size = %d.\n", - __func__, miss_addr, - read_pkt->getAddr(), read_pkt->getSize()); - enqueueMemReq(read_pkt); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cache line [%d] for Addr: %lu.\n", - __func__, block_index, aligned_miss_addr); - } else { - - // Since allocated is false, does not matter what the address is. 
- cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = false; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Deallocated cache line [%d].\n", - __func__, block_index); - } + if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: Change observed on " + "cacheBlocks[%d].\n", __func__, block_index); + PacketPtr write_pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + write_pkt->getAddr(), write_pkt->getSize()); + enqueueMemReq(write_pkt); } + assert(!MSHR[block_index].empty()); + Addr miss_addr = MSHR[block_index].front(); + DPRINTF(CoalesceEngine, "%s: First conflicting address for " + "cacheBlocks[%d] is Addr: %lu.\n", + __func__, block_index, miss_addr); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].allocated = true; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].hasConflict = true; + cacheBlocks[block_index].dirty = false; + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " + "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); + fillQueue.push_back(block_index); } - evictQueue.pop_front(); + writeBackQueue.pop_front(); - if ((!evictQueue.empty()) && - (!nextEvictEvent.scheduled())) { - schedule(nextEvictEvent, nextCycle()); + if (!writeBackQueue.empty()) { + assert(!nextWriteBackEvent.pending()); + assert(!nextWriteBackEvent.scheduled()); + schedule(nextWriteBackEvent, nextCycle()); } } @@ -817,10 +798,11 @@ CoalesceEngine::processNextSendRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - 
evictQueue.push_back(block_index); - if ((!nextEvictEvent.scheduled()) && - (!pendingMemRetry())) { - schedule(nextEvictEvent, nextCycle()); + writeBackQueue.push_back(block_index); + if ((!writeBackQueue.empty()) && + (!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); } } } @@ -829,6 +811,8 @@ CoalesceEngine::processNextSendRetryEvent() // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite // queueing in the outstandingMemReqQueue. + // FIXME: Also do not send requests for cache lines that are already + // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05fa555ec8..563fa671b3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -104,25 +104,28 @@ class CoalesceEngine : public BaseMemEngine int numLines; int numElementsPerLine; - int numMSHREntry; + int numMSHREntries; int numTgtsPerMSHR; - std::unordered_map> MSHRMap; - std::deque lineFillBuffer; + std::unordered_map> MSHR; + + std::deque fillQueue; + std::deque> responseQueue; int currentBitSliceIndex; int numRetriesReceived; - FIFOSet applyQueue; + InOutSet applyQueue; std::bitset needsPush; - FIFOSet evictQueue; + InOutSet writeBackQueue; + InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - MemoryEvent nextReadOnMissEvent; - void processNextReadOnMissEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -130,8 +133,8 @@ class CoalesceEngine : public BaseMemEngine EventFunctionWrapper 
nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextEvictEvent; - void processNextEvictEvent(); + MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(); EventFunctionWrapper nextSendRetryEvent; void processNextSendRetryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index ab2962b253..5ab8db401c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -117,7 +117,7 @@ PushEngine::deallocatePushSpace(int space) (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" "free spaces.\n", __func__, free_space); - if ((free_space > numElementsPerLine) && + if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); From e0f5242c06f12b799b76455d0b95ba90e6238e74 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 23 Jul 2022 23:57:58 -0700 Subject: [PATCH 123/287] Implemented MemoryEvent retry mechanism. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 87 ++++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 17 +++-- src/accl/graph/sega/push_engine.hh | 3 + 5 files changed, 88 insertions(+), 26 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index eb209911be..ffd74241e7 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -19,7 +19,7 @@ def __init__(self, base_edge_addr): cache_size="1MiB", num_mshr_entry=1, num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + outstanding_mem_req_queue_size=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, on_the_fly_update_map_size=1) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ddbd22a8b5..4a0600e9c0 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -300,10 +300,16 @@ CoalesceEngine::recvWLRead(Addr addr) void CoalesceEngine::processNextMemoryReadEvent() { + assert(!nextMemoryReadEvent.pending()); if (memQueueFull()) { - nextMemoryReadEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ nextMemoryReadEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextMemoryReadEvent"); return; } @@ -351,8 +357,33 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { - // assert(!nextEvictEvent.scheduled()); - // schedule(nextEvictEvent, nextCycle()); + assert(!pendingEventQueue.empty()); + std::string front = pendingEventQueue.front(); + + if (front == "nextMemoryReadEvent") { + assert(!nextMemoryReadEvent.scheduled()); + assert(nextMemoryReadEvent.pending()); + schedule(nextMemoryReadEvent, nextCycle()); + nextMemoryReadEvent.wake(); + } else if (front == "nextWriteBackEvent") { + assert(!nextWriteBackEvent.scheduled()); + assert(nextWriteBackEvent.pending()); + schedule(nextWriteBackEvent, nextCycle()); + nextWriteBackEvent.wake(); + } else if (front == "nextSendRetryEvent") { + assert(!nextSendRetryEvent.scheduled()); + assert(nextSendRetryEvent.pending()); + breakPointFunction(); + schedule(nextSendRetryEvent, nextCycle()); + nextSendRetryEvent.wake(); + } else { + panic("EVENT IS NOT RECOGNIZED.\n"); + } + + pendingEventQueue.pop_front(); + if (!pendingEventQueue.empty()) { + requestMemRetry(1); + } return; } @@ -652,10 +683,16 @@ CoalesceEngine::processNextApplyEvent() void CoalesceEngine::processNextWriteBackEvent() { + assert(!nextWriteBackEvent.pending()); if (memQueueFull()) { nextWriteBackEvent.sleep(); // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. 
+ if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextWriteBackEvent"); return; } @@ -715,20 +752,25 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. + assert(!nextSendRetryEvent.pending()); + assert(!nextSendRetryEvent.scheduled()); + assert(numRetriesReceived == 1); + schedule(nextSendRetryEvent, nextCycle()); } void CoalesceEngine::processNextSendRetryEvent() { - if (needsPush.count() == 0) { - DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - "bit in needsPush. Rejecting the retry.\n", __func__); - peerPushEngine->recvRetryReject(); - return; - } + assert(!nextSendRetryEvent.pending()); + assert(needsPush.count() != 0); + // if (needsPush.count() == 0) { + // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " + // "bit in needsPush. Rejecting the retry.\n", __func__); + // peerPushEngine->recvRetryReject(); + // return; + // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -807,6 +849,16 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { + if (memQueueFull()) { + nextSendRetryEvent.sleep(); + if (!pendingMemRetry()) { + assert(pendingEventQueue.empty()); + requestMemRetry(1); + } + pendingEventQueue.push_back("nextSendRetryEvent"); + return; + } + // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. 
For now we're enabling infinite @@ -816,17 +868,12 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - if (allocateMemQueueSpace(1)) { - enqueueMemReq(pkt); - } else { - requestMemRetry(1); - } + enqueueMemReq(pkt); } numRetriesReceived--; - if ((numRetriesReceived > 0) && (!nextSendRetryEvent.scheduled())) { - schedule(nextSendRetryEvent, nextCycle()); - } + assert(numRetriesReceived == 0); + assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 563fa671b3..83ca6e5f14 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -124,6 +124,8 @@ class CoalesceEngine : public BaseMemEngine int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::deque pendingEventQueue; + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); @@ -136,7 +138,7 @@ class CoalesceEngine : public BaseMemEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - EventFunctionWrapper nextSendRetryEvent; + MemoryEvent nextSendRetryEvent; void processNextSendRetryEvent(); struct CoalesceStats : public statistics::Group @@ -159,6 +161,7 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; + void breakPointFunction() { std::cout << "Salaam." 
<< std::endl; } protected: virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5ab8db401c..c64ff003c4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,6 +43,7 @@ PushEngine::PushEngine(const PushEngineParams ¶ms): numTotalRetries(0), numPendingRetries(0), nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), + nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -121,8 +122,8 @@ PushEngine::deallocatePushSpace(int space) (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -221,8 +222,8 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); + assert(!nextSendRetryEvent.scheduled()); + schedule(nextSendRetryEvent, nextCycle()); } } } @@ -239,6 +240,14 @@ PushEngine::processNextAddrGenEvent() } } +void +PushEngine::processNextSendRetryEvent() +{ + assert(numPendingRetries == 0); + numPendingRetries++; + peerCoalesceEngine->recvPushRetry(); +} + void PushEngine::recvMemRetry() { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a3a308554f..378cd1a487 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -128,6 +128,9 @@ class PushEngine : public BaseMemEngine EventFunctionWrapper nextPushEvent; void processNextPushEvent(); + EventFunctionWrapper nextSendRetryEvent; + void processNextSendRetryEvent(); + struct PushStats : public statistics::Group { 
PushStats(PushEngine &push); From 42ff3b88231d9f69c4f0fcb7ccbddfc2db66d799 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:43:22 -0700 Subject: [PATCH 124/287] Adding DPRINTF for structure sizes. --- src/accl/graph/SConscript | 2 +- src/accl/graph/base/base_mem_engine.cc | 47 +++++--- src/accl/graph/base/base_mem_engine.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/push_engine.cc | 5 +- src/accl/graph/sega/wl_engine.cc | 151 +++++++++++++++---------- src/accl/graph/sega/wl_engine.hh | 8 +- 7 files changed, 134 insertions(+), 85 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index f5f7e962af..7fd3591b2c 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -27,5 +27,5 @@ Import('*') - +DebugFlag('SEGAStructureSize') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index cb4c1d81bb..aa78aac8b5 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/base/base_mem_engine.hh" #include "debug/BaseMemEngine.hh" +#include "debug/SEGAStructureSize.hh" namespace gem5 { @@ -37,7 +38,7 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): ClockedObject(params), system(params.system), memPort(name() + ".mem_port", this), - outstandingMemReqQueueSize(params.outstanding_mem_req_queue_size), + memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), respQueueSize(params.resp_queue_size), memRetryRequested(false), @@ -99,17 +100,22 @@ BaseMemEngine::processNextMemReqEvent() { if ((respQueueSize == 0) || ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = outstandingMemReqQueue.front(); + PacketPtr pkt = memQueue.front(); memPort.sendPacket(pkt); onTheFlyReqs++; DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following 
info. " "pkt->addr: %lu, pkt->size: %lu.\n", __func__, pkt->getAddr(), pkt->getSize()); - outstandingMemReqQueue.pop_front(); - + memQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " + "memQueue. memQueue.size = %d, memQueueSize = %d.\n", + __func__, pkt->print(), memQueue.size(), memQueueSize); if (memRetryRequested && - (outstandingMemReqQueue.size() <= - (outstandingMemReqQueueSize - memSpaceRequested))) { + (memQueue.size() <= + (memQueueSize - memSpaceRequested))) { memRetryRequested = false; memSpaceRequested = 0; recvMemRetry(); @@ -117,7 +123,7 @@ BaseMemEngine::processNextMemReqEvent() } if ((!memPort.blocked()) && - (!outstandingMemReqQueue.empty()) && (!nextMemReqEvent.scheduled())) { + (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { schedule(nextMemReqEvent, nextCycle()); } } @@ -156,30 +162,35 @@ BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) bool BaseMemEngine::allocateMemQueueSpace(int space) { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= (outstandingMemReqQueueSize - space)) + (memQueueSize == 0) || + (memQueue.size() <= (memQueueSize - space)) ); } bool BaseMemEngine::memQueueFull() { - assert((outstandingMemReqQueueSize == 0) || - (outstandingMemReqQueue.size() <= outstandingMemReqQueueSize)); + assert((memQueueSize == 0) || + (memQueue.size() <= memQueueSize)); return ( - (outstandingMemReqQueueSize != 0) && - (outstandingMemReqQueue.size() == outstandingMemReqQueueSize)); + (memQueueSize != 0) && + (memQueue.size() == memQueueSize)); } void BaseMemEngine::enqueueMemReq(PacketPtr pkt) { 
panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - outstandingMemReqQueue.push_back(pkt); - + memQueue.push_back(pkt); + DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); + DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " + "memQueue.size = %d, memQueueSize = %d.\n", __func__, + pkt->print(), memQueue.size(), memQueueSize); if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { schedule(nextMemReqEvent, nextCycle()); } @@ -199,7 +210,7 @@ void BaseMemEngine::wakeUp() { assert(!nextMemReqEvent.scheduled()); - if (!outstandingMemReqQueue.empty()) { + if (!memQueue.empty()) { schedule(nextMemReqEvent, nextCycle()); } } diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 64ef49ee1d..520970c5a0 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -68,12 +68,12 @@ class BaseMemEngine : public ClockedObject System* system; MemPort memPort; - int outstandingMemReqQueueSize; + int memQueueSize; int onTheFlyReqs; int respQueueSize; bool memRetryRequested; int memSpaceRequested; - std::deque outstandingMemReqQueue; + std::deque memQueue; EventFunctionWrapper nextMemReqEvent; void processNextMemReqEvent(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4a0600e9c0..ea572ea749 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -862,7 +862,7 @@ CoalesceEngine::processNextSendRetryEvent() // FIXME: Fix the retry mechanism between memory and cache to // handle memory retries correctly. This probably requires scheduling // an event for sending the retry. For now we're enabling infinite - // queueing in the outstandingMemReqQueue. + // queueing in the memQueue. 
// FIXME: Also do not send requests for cache lines that are already // read but await data. Just set a flag or sth. PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c64ff003c4..d745dabef6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -222,8 +222,9 @@ PushEngine::processNextAddrGenEvent() (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } } } } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 79bf046ba3..2d4ffc9cac 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/wl_engine.hh" +#include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" @@ -39,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - onTheFlyUpdateMapSize(params.on_the_fly_update_map_size), + registerFileSize(params.on_the_fly_update_map_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -129,45 +130,68 @@ WLEngine::processNextReadEvent() uint32_t update_value; std::tie(update_addr, update_value) = updateQueue.front(); - DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. Addr: %lu, " - "value: %u.\n", __func__, update_addr, update_value); + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" + "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((onTheFlyUpdateMap.find(update_addr) == onTheFlyUpdateMap.end())) { - DPRINTF(WLEngine, "%s: Did not find the addr: %lu in onTheFlyUpdateMap.\n", - __func__, update_addr); - if (onTheFlyUpdateMap.size() < onTheFlyUpdateMapSize) { + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + // TODO: It might be a good idea for WLEngine to act differently + // on cache rejects. As a first step the cache should not just + // return a boolean value. It should return an integer/enum + // to tell WLEngine why it rejected the read request. Their might + // be things that WLEngine can do to fix head of the line blocking. if (coalesceEngine->recvWLRead(update_addr)) { - onTheFlyUpdateMap[update_addr] = update_value; - DPRINTF(WLEngine, "%s: Added a new item to onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, - update_addr, onTheFlyUpdateMap[update_addr]); + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = update_value; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } } } else { // TODO: Generalize this to reduce function rather than just min - DPRINTF(WLEngine, "%s: Found the addr: %lu in onTheFlyUpdateMap. " - "onTheFlyUpdateMap[%lu] = %u.\n", __func__, update_addr, - update_addr, onTheFlyUpdateMap[update_addr]); - onTheFlyUpdateMap[update_addr] = - std::min(update_value, onTheFlyUpdateMap[update_addr]); - DPRINTF(WLEngine, "%s: Reduced the update_value with the entry in " - "onTheFlyUpdateMap. onTheFlyUpdateMap[%lu] = %u.\n", - __func__, update_addr, onTheFlyUpdateMap[update_addr]); - stats.onTheFlyCoalesce++; + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, registerFile[update_addr]); + registerFile[update_addr] = + std::min(update_value, registerFile[update_addr]); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, registerFile[update_addr]); + stats.registerFileCoalesce++; updateQueue.pop_front(); - DPRINTF(WLEngine, "%s: Popped an item from the front of updateQueue" - ". updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); respPort.checkRetryReq(); } - // TODO: Only schedule nextReadEvent only when it has to be scheduled - if ((!nextReadEvent.scheduled()) && (!updateQueue.empty())) { + if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { schedule(nextReadEvent, nextCycle()); } } @@ -175,14 +199,16 @@ WLEngine::processNextReadEvent() void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { - assert(addrWorkListMap.size() <= onTheFlyUpdateMapSize); + assert(workListFile.size() <= registerFileSize); - addrWorkListMap[addr] = wl; - DPRINTF(WLEngine, "%s: Received a WorkListItem from the coalesceEngine. Adding" - " it to the addrWorkListMap. addrWorkListMap[%lu] = %s.\n", - __func__, addr, wl.to_string()); - - assert(!addrWorkListMap.empty()); + workListFile[addr] = wl; + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " + "workListFile. workListFile.size = %d.\n", __func__, addr, + wl.to_string(), workListFile.size()); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -191,28 +217,31 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - for (auto &it : addrWorkListMap) { + for (auto &it : workListFile) { Addr addr = it.first; - assert(onTheFlyUpdateMap.find(addr) != onTheFlyUpdateMap.end()); - uint32_t update_value = onTheFlyUpdateMap[addr]; - DPRINTF(WLEngine, "%s: Reducing between onTheFlyUpdateMap and " - "addrWorkListMap values. 
onTheFlyUpdateMap[%lu] = %u, " - "addrWorkListMap[%lu] = %s.\n", __func__, - addr, onTheFlyUpdateMap[addr], - addr, addrWorkListMap[addr].to_string()); + assert(registerFile.find(addr) != registerFile.end()); + uint32_t update_value = registerFile[addr]; + DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + __func__, addr, registerFile[addr], + addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min - addrWorkListMap[addr].tempProp = - std::min(update_value, addrWorkListMap[addr].tempProp); - DPRINTF(WLEngine, "%s: Reduction done. addrWorkListMap[%lu] = %s.\n", - __func__, addr, addrWorkListMap[addr].to_string()); + workListFile[addr].tempProp = + std::min(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", + __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, addrWorkListMap[addr]); - onTheFlyUpdateMap.erase(addr); - DPRINTF(WLEngine, "%s: Erased addr: %lu from onTheFlyUpdateMap.\n", - __func__, addr); + coalesceEngine->recvWLWrite(addr, workListFile[addr]); + registerFile.erase(addr); + DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " + "registerFile.size = %d, registerFileSize = %d\n", + __func__, addr, registerFile.size(), registerFileSize); } - addrWorkListMap.clear(); + workListFile.clear(); } bool @@ -224,11 +253,19 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(WLEngine, "%s: Pushed an item to the back of updateQueue" - ". 
updateQueue.size = %u.\n", - __func__, updateQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + + + // delete the packet since it's not needed anymore. delete pkt; - assert(!updateQueue.empty()); + if (!nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } @@ -241,7 +278,7 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), - ADD_STAT(onTheFlyCoalesce, statistics::units::Count::get(), + ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), "Number of memory blocks read for vertecies") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 27fc3efa7a..79fe60f6d0 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -74,10 +74,10 @@ class WLEngine : public BaseReduceEngine int updateQueueSize; std::deque> updateQueue; - int onTheFlyUpdateMapSize; - std::unordered_map onTheFlyUpdateMap; + int registerFileSize; + std::unordered_map registerFile; - std::unordered_map addrWorkListMap; + std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); @@ -98,7 +98,7 @@ class WLEngine : public BaseReduceEngine WLEngine &wl; statistics::Scalar numReduce; - statistics::Scalar onTheFlyCoalesce; + statistics::Scalar registerFileCoalesce; }; WorkListStats stats; From 5f513830921f24659a9e7fcb8aea10720a27840a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 24 Jul 2022 17:44:06 -0700 Subject: [PATCH 125/287] Updating config script for sega. 
--- configs/accl/sega.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ffd74241e7..cf189733f0 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,20 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=4, + push_req_queue_size=16, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + outstanding_mem_req_queue_size=4, + resp_queue_size=8) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, cache_size="1MiB", - num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=1) + num_mshr_entry=8, + num_tgts_per_mshr=8, + outstanding_mem_req_queue_size=8) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=16, + on_the_fly_update_map_size=8) def getRespPort(self): return self.wl_engine.resp_port From ed206a8acdb86f3aa17df9e1d3d44e241385c67e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:14:08 -0700 Subject: [PATCH 126/287] Adding more assertion for MSHR and fillQueue. 
--- configs/accl/sega.py | 12 ++++++------ src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/push_engine.cc | 2 +- 3 files changed, 10 insertions(+), 7 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index cf189733f0..8fb3b75996 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -16,13 +16,13 @@ def __init__(self, base_edge_addr): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="1MiB", - num_mshr_entry=8, - num_tgts_per_mshr=8, - outstanding_mem_req_queue_size=8) + cache_size="128B", + num_mshr_entry=1, + num_tgts_per_mshr=1, + outstanding_mem_req_queue_size=0) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=16, - on_the_fly_update_map_size=8) + update_queue_size=1, + on_the_fly_update_map_size=4) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ea572ea749..8f56962a8c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -232,6 +232,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { assert(!cacheBlocks[block_index].valid); + assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" @@ -251,6 +252,7 @@ CoalesceEngine::recvWLRead(Addr addr) // enqueueMemReq(pkt); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", @@ -737,6 +739,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); + assert(fillQueue.size() <= numLines); } writeBackQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d745dabef6..a41ca8a778 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -217,7 +217,7 @@ PushEngine::processNextAddrGenEvent() int free_space = pushReqQueueSize - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - "free spaces.\n", __func__, free_space); + " free spaces.\n", __func__, free_space); if ((free_space >= numElementsPerLine) && (numPendingRetries == 0)) { DPRINTF(PushEngine, "%s: Sent a push retry to " From cdfd9817d9a3908fc86b2ec1f95420524b953ea3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:27:10 -0700 Subject: [PATCH 127/287] Adding debug flags for responseQueue size. --- src/accl/graph/sega/coalesce_engine.cc | 41 +++++++++++++++++++------- src/accl/graph/sega/wl_engine.hh | 2 ++ 2 files changed, 32 insertions(+), 11 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f56962a8c..959bfa9743 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,6 +32,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" namespace gem5 @@ -168,11 +169,18 @@ CoalesceEngine::recvWLRead(Addr addr) // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit. Pushed cacheBlocks[%d][%d]: %s " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; @@ -345,9 +353,12 @@ CoalesceEngine::processNextRespondEvent() __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); if ((!nextRespondEvent.scheduled()) && (!responseQueue.empty())) { @@ -536,10 +547,18 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CoalesceEngine, "%s: Pushed cacheBlocks[%d][%d] to " - "responseQueue. 
responseQueue.size = %u.\n" - , __func__, block_index, wl_offset, - responseQueue.size()); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d, " + "responseQueueSize = %d.\n", __func__, addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), + peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 79fe60f6d0..5e8e5b25f3 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -114,6 +114,8 @@ class WLEngine : public BaseReduceEngine bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + + int getRegisterFileSize() { return registerFileSize; } }; } From 4a466aec9457f93be6bfa689489c8376c08d31c6 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 18:33:53 -0700 Subject: [PATCH 128/287] Adding assertions to test the size of queues in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 10 +++++++++- src/accl/graph/sega/coalesce_engine.hh | 1 - 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 959bfa9743..753bfc988b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): numRetriesReceived(0), applyQueue(numLines), writeBackQueue(numLines), - replaceQueue(numLines), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), @@ -320,6 +319,8 @@ CoalesceEngine::processNextMemoryReadEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextMemoryReadEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } @@ -460,6 +461,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { schedule(nextWriteBackEvent, nextCycle()); @@ -683,6 +685,7 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", __func__, block_index, writeBackQueue.size()); } @@ -714,6 +717,8 @@ CoalesceEngine::processNextWriteBackEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextWriteBackEvent"); + // Maximum three MemoryEvent. 
+ assert(pendingEventQueue.size() <= 3); return; } @@ -863,6 +868,7 @@ CoalesceEngine::processNextSendRetryEvent() } if (cacheBlocks[block_index].hasConflict) { writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); if ((!writeBackQueue.empty()) && (!nextWriteBackEvent.pending()) && (!nextWriteBackEvent.scheduled())) { @@ -878,6 +884,8 @@ CoalesceEngine::processNextSendRetryEvent() requestMemRetry(1); } pendingEventQueue.push_back("nextSendRetryEvent"); + // Maximum three MemoryEvent. + assert(pendingEventQueue.size() <= 3); return; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 83ca6e5f14..cfa0a79102 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,7 +118,6 @@ class CoalesceEngine : public BaseMemEngine std::bitset needsPush; InOutSet writeBackQueue; - InOutSet replaceQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); From 48711528ef72651cccb68b08303159ce8b3fc071 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 24 Jul 2022 22:43:28 -0700 Subject: [PATCH 129/287] Checking the size of queues in PushEngine and WLEngine --- src/accl/graph/base/base_mem_engine.cc | 2 +- src/accl/graph/base/base_mem_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 2 ++ 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc index aa78aac8b5..590307b2bc 100644 --- a/src/accl/graph/base/base_mem_engine.cc +++ b/src/accl/graph/base/base_mem_engine.cc @@ -40,10 +40,10 @@ BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): memPort(name() + ".mem_port", this), memQueueSize(params.outstanding_mem_req_queue_size), onTheFlyReqs(0), - respQueueSize(params.resp_queue_size), memRetryRequested(false), memSpaceRequested(0), nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), + respQueueSize(params.resp_queue_size), 
_requestorId(system->getRequestorId(this)), peerMemoryAtomSize(params.attached_memory_atom_size) {} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh index 520970c5a0..01c862d555 100644 --- a/src/accl/graph/base/base_mem_engine.hh +++ b/src/accl/graph/base/base_mem_engine.hh @@ -70,7 +70,6 @@ class BaseMemEngine : public ClockedObject int memQueueSize; int onTheFlyReqs; - int respQueueSize; bool memRetryRequested; int memSpaceRequested; std::deque memQueue; @@ -79,6 +78,8 @@ class BaseMemEngine : public ClockedObject void processNextMemReqEvent(); protected: + + int respQueueSize; const RequestorID _requestorId; size_t peerMemoryAtomSize; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a41ca8a778..cfebf8e5df 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -173,6 +173,7 @@ PushEngine::recvWLItemRetry(WorkListItem wl) pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, value); + assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -263,6 +264,7 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); + assert(memRespQueue.size() <= respQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); From 29ae1de4908cf215a44bcd8c9db9091c8306cf1b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 09:33:11 -0700 Subject: [PATCH 130/287] Making CoalesceEngine a BaseMemoryEngine. 
--- configs/accl/sega.py | 13 ++- src/accl/graph/sega/BaseMemoryEngine.py | 42 ++++++++ src/accl/graph/sega/CoalesceEngine.py | 17 ++- src/accl/graph/sega/SConscript | 3 + src/accl/graph/sega/base_memory_engine.cc | 122 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.hh | 99 ++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 70 ++++--------- src/accl/graph/sega/coalesce_engine.hh | 14 +-- 8 files changed, 305 insertions(+), 75 deletions(-) create mode 100644 src/accl/graph/sega/BaseMemoryEngine.py create mode 100644 src/accl/graph/sega/base_memory_engine.cc create mode 100644 src/accl/graph/sega/base_memory_engine.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8fb3b75996..7577331f2b 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -9,20 +9,19 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=16, + push_req_queue_size=2, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=4, - resp_queue_size=8) + outstanding_mem_req_queue_size=1, + resp_queue_size=1) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="128B", + cache_size="32B", num_mshr_entry=1, - num_tgts_per_mshr=1, - outstanding_mem_req_queue_size=0) + num_tgts_per_mshr=1) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, update_queue_size=1, - on_the_fly_update_map_size=4) + on_the_fly_update_map_size=1) def getRespPort(self): return self.wl_engine.resp_port diff --git a/src/accl/graph/sega/BaseMemoryEngine.py b/src/accl/graph/sega/BaseMemoryEngine.py new file mode 100644 index 0000000000..10d8b708f0 --- /dev/null +++ b/src/accl/graph/sega/BaseMemoryEngine.py @@ -0,0 +1,42 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class BaseMemoryEngine(ClockedObject): + abstract = True + type = 'BaseMemoryEngine' + cxx_header = "accl/graph/sega/base_memory_engine.hh" + cxx_class = 'gem5::BaseMemoryEngine' + + system = Param.System(Parent.any, 'System this Engine is a part of') + mem_port = RequestPort("Port to communicate with the memory") + + attached_memory_atom_size = Param.Int(64, "The atom size of the attached " + "memory.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 7667a22c5a..536c3477ae 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,21 +27,16 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class CoalesceEngine(BaseMemEngine): +class CoalesceEngine(BaseMemoryEngine): type = 'CoalesceEngine' cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "") - - cache_size = Param.MemorySize("16KiB", "Size of the internal cache.") - - num_mshr_entry = Param.Int(4, "") - num_tgts_per_mshr = Param.Int(20, "") - - # Don't change. If changed. It will break functionality of coalesce. 
- resp_queue_size = 0 + peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") + cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + num_mshr_entry = Param.Int(4, "Number of MSHR entries.") + num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 77e508f4ed..97a62d44a0 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -27,16 +27,19 @@ Import('*') +SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') SimObject('PushEngine.py') SimObject('WLEngine.py') +Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') +DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc new file mode 100644 index 0000000000..e5e78f2c04 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/base_memory_engine.hh" + +#include "debug/BaseMemoryEngine.hh" +#include "debug/SEGAStructureSize.hh" + +namespace gem5 +{ + +BaseMemoryEngine::BaseMemoryEngine(const BaseMemoryEngineParams ¶ms): + ClockedObject(params), + system(params.system), + _requestorId(system->getRequestorId(this)), + memPort(name() + ".mem_port", this), + peerMemoryAtomSize(params.attached_memory_atom_size) +{} + +BaseMemoryEngine::~BaseMemoryEngine() +{} + +Port& +BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) +{ + if (if_name == "mem_port") { + return memPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) +{ + panic_if(_blocked, "Should never try to send if blocked MemSide!"); + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + _blocked = true; + } else { + owner->recvMemRetry(); + } +} + +bool +BaseMemoryEngine::MemPort::recvTimingResp(PacketPtr pkt) +{ + return owner->handleMemResp(pkt); +} + +void +BaseMemoryEngine::MemPort::recvReqRetry() +{ + panic_if(!(_blocked && blockedPacket), + "Received retry without a blockedPacket"); + + _blocked = false; + 
sendPacket(blockedPacket); + + if (!blocked()) { + blockedPacket = nullptr; + } +} + +PacketPtr +BaseMemoryEngine::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + +PacketPtr +BaseMemoryEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) +{ + RequestPtr req = std::make_shared(addr, size, 0, _requestorId); + + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) _requestorId) << 2); + + PacketPtr pkt = new Packet(req, MemCmd::WriteReq); + pkt->allocate(); + pkt->setData(data); + + return pkt; +} + +} diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh new file mode 100644 index 0000000000..8fb8fde7e6 --- /dev/null +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. 
+ * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ + +#include + +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "params/BaseMemoryEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" + +namespace gem5 +{ + +class BaseMemoryEngine : public ClockedObject +{ + private: + class MemPort : public RequestPort + { + private: + BaseMemoryEngine* owner; + bool _blocked; + PacketPtr blockedPacket; + + public: + MemPort(const std::string& name, BaseMemoryEngine* owner): + RequestPort(name, owner), owner(owner), + _blocked(false), blockedPacket(nullptr) + {} + + void sendPacket(PacketPtr pkt); + bool blocked() { return _blocked; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + protected: + System* system; + const RequestorID _requestorId; + + MemPort memPort; + + size_t peerMemoryAtomSize; + + virtual void recvMemRetry() = 0; + virtual bool handleMemResp(PacketPtr pkt) = 0; + + PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + + 
public: + PARAMS(BaseMemoryEngine); + + BaseMemoryEngine(const Params ¶ms); + ~BaseMemoryEngine(); + + Port& getPort(const std::string &if_name, + PortID idx=InvalidPortID) override; + + AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } + + void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + +}; + +} + +#endif // __ACCL_GRAPH_SEGA_BASE_MEMORY_ENGINE_HH__ diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 753bfc988b..678cf0456e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -38,8 +38,8 @@ namespace gem5 { -CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): - BaseMemEngine(params), +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), @@ -67,12 +67,6 @@ CoalesceEngine::CoalesceEngine(const CoalesceEngineParams ¶ms): needsPush.reset(); } -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - sendMemFunctional(pkt); -} - void CoalesceEngine::startup() { @@ -171,13 +165,13 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
@@ -257,7 +251,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - // enqueueMemReq(pkt); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); // FIXME: Fix this DPRINTF @@ -310,16 +303,12 @@ void CoalesceEngine::processNextMemoryReadEvent() { assert(!nextMemoryReadEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { // TODO: Implement interface where events of the CoalesceEngine are // pushed to a fifo to be scheduled later. nextMemoryReadEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvent. + // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); return; } @@ -330,7 +319,7 @@ CoalesceEngine::processNextMemoryReadEvent() DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); fillQueue.pop_front(); @@ -367,11 +356,13 @@ CoalesceEngine::processNextRespondEvent() } } -// FIXME: Update this for implementing event retry interaction. void CoalesceEngine::recvMemRetry() { - assert(!pendingEventQueue.empty()); + if (pendingEventQueue.empty()) { + return; + } + std::string front = pendingEventQueue.front(); if (front == "nextMemoryReadEvent") { @@ -387,7 +378,6 @@ CoalesceEngine::recvMemRetry() } else if (front == "nextSendRetryEvent") { assert(!nextSendRetryEvent.scheduled()); assert(nextSendRetryEvent.pending()); - breakPointFunction(); schedule(nextSendRetryEvent, nextCycle()); nextSendRetryEvent.wake(); } else { @@ -395,12 +385,10 @@ CoalesceEngine::recvMemRetry() } pendingEventQueue.pop_front(); - if (!pendingEventQueue.empty()) { - requestMemRetry(1); - } return; } +// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { @@ -552,13 +540,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size(), peerWLEngine->getRegisterFileSize()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d, " "responseQueueSize = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. @@ -708,14 +696,8 @@ void CoalesceEngine::processNextWriteBackEvent() { assert(!nextWriteBackEvent.pending()); - if (memQueueFull()) { + if (memPort.blocked()) { nextWriteBackEvent.sleep(); - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -744,7 +726,7 @@ CoalesceEngine::processNextWriteBackEvent() DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); - enqueueMemReq(write_pkt); + memPort.sendPacket(write_pkt); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -764,6 +746,10 @@ CoalesceEngine::processNextWriteBackEvent() "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); fillQueue.push_back(block_index); assert(fillQueue.size() <= numLines); + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())){ + schedule(nextMemoryReadEvent, nextCycle()); + } } writeBackQueue.pop_front(); @@ -792,12 +778,6 @@ CoalesceEngine::processNextSendRetryEvent() { assert(!nextSendRetryEvent.pending()); assert(needsPush.count() != 0); - // if (needsPush.count() == 0) { - // DPRINTF(CoalesceEngine, "%s: Received a retry while there are no set " - // "bit in needsPush. Rejecting the retry.\n", __func__); - // peerPushEngine->recvRetryReject(); - // return; - // } DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); Addr block_addr = 0; @@ -877,12 +857,8 @@ CoalesceEngine::processNextSendRetryEvent() } } } else { - if (memQueueFull()) { + if (memPort.blocked()) { nextSendRetryEvent.sleep(); - if (!pendingMemRetry()) { - assert(pendingEventQueue.empty()); - requestMemRetry(1); - } pendingEventQueue.push_back("nextSendRetryEvent"); // Maximum three MemoryEvent. 
assert(pendingEventQueue.size() <= 3); @@ -898,7 +874,7 @@ CoalesceEngine::processNextSendRetryEvent() PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); - enqueueMemReq(pkt); + memPort.sendPacket(pkt); } numRetriesReceived--; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index cfa0a79102..a322379b05 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,7 +31,7 @@ #include -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" #include "base/statistics.hh" @@ -39,14 +39,12 @@ #define MAX_BITVECTOR_SIZE (1 << 30) -// TODO: Add parameters for size, memory atom size, type size, -// length of items in the blocks. namespace gem5 { class WLEngine; -class CoalesceEngine : public BaseMemEngine +class CoalesceEngine : public BaseMemoryEngine { private: class MemoryEvent : public EventFunctionWrapper @@ -160,16 +158,14 @@ class CoalesceEngine : public BaseMemEngine CoalesceStats stats; - void breakPointFunction() { std::cout << "Salaam." << std::endl; } protected: - virtual int respBuffSize() { return -1; } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(CoalesceEngine); - CoalesceEngine(const CoalesceEngineParams ¶ms); + CoalesceEngine(const Params ¶ms); bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); @@ -178,9 +174,7 @@ class CoalesceEngine : public BaseMemEngine void recvPushRetry(); - void recvFunctional(PacketPtr pkt); - - virtual void startup(); + virtual void startup() override; }; } From bbc7e3afbea04fd283157f89d024f4f9b9c2d78d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 13:06:22 -0700 Subject: [PATCH 131/287] Fixing cache mapping issue. 
--- src/accl/graph/SConscript | 3 +- src/accl/graph/sega/base_memory_engine.cc | 14 +++ src/accl/graph/sega/base_memory_engine.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 105 ++++++++++++---------- src/accl/graph/sega/coalesce_engine.hh | 6 +- 5 files changed, 78 insertions(+), 52 deletions(-) diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 7fd3591b2c..53c6411de6 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,4 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', 'BaseMemEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', + 'BaseMemEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index e5e78f2c04..9db95d6bd6 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -55,6 +55,20 @@ BaseMemoryEngine::getPort(const std::string &if_name, PortID idx) } } +void +BaseMemoryEngine::init() +{ + AddrRangeList memory_ranges = memPort.getAddrRanges(); + // BaseMemoryEngine only supports one memory. + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " + "The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); +} + void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 8fb8fde7e6..efbfa5312d 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -69,6 +69,7 @@ class BaseMemoryEngine : public ClockedObject System* system; const RequestorID _requestorId; + AddrRange peerMemoryRange; MemPort memPort; size_t peerMemoryAtomSize; @@ -92,6 +93,7 @@ class BaseMemoryEngine : public ClockedObject void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void init() override; }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 678cf0456e..21f048213a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -67,44 +67,48 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -void -CoalesceEngine::startup() -{ - AddrRangeList vertex_ranges = getAddrRanges(); - - bool found = false; - Addr first_match_addr = 0; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(first_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - first_match_addr += peerMemoryAtomSize; - } - - found = false; - Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - while(true) { - for (auto range: vertex_ranges) { - if (range.contains(second_match_addr)) { - found = true; - break; - } - } - if (found) { - break; - } - second_match_addr += peerMemoryAtomSize; - } - - nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - memoryAddressOffset = first_match_addr; -} +// void +// CoalesceEngine::startup() +// { +// return; + // std::cout << "Hello" << std::endl; + // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", + // __func__, peerMemoryRange.to_string()); + // AddrRangeList vertex_ranges = getAddrRanges(); + + 
// bool found = false; + // Addr first_match_addr = 0; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(first_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // first_match_addr += peerMemoryAtomSize; + // } + + // found = false; + // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; + // while(true) { + // for (auto range: vertex_ranges) { + // if (range.contains(second_match_addr)) { + // found = true; + // break; + // } + // } + // if (found) { + // break; + // } + // second_match_addr += peerMemoryAtomSize; + // } + + // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); + // memoryAddressOffset = first_match_addr; +// } void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) @@ -117,7 +121,10 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - return ((int) (addr / peerMemoryAtomSize)) % numLines; + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", + __func__, addr, trimmed_addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } // addr should be aligned to peerMemoryAtomSize @@ -125,10 +132,10 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - int atom_index = (int) (addr / (peerMemoryAtomSize * nmpu)); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - int bit_index = atom_index * block_bits; - return bit_index; + return atom_index * block_bits; } // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) @@ -136,9 +143,8 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr block_addr = (nmpu * peerMemoryAtomSize) * - 
((int)(index / (peerMemoryAtomSize / sizeof(WorkListItem)))); - return (block_addr + memoryAddressOffset); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); } bool @@ -149,7 +155,8 @@ CoalesceEngine::recvWLRead(Addr addr) __func__, addr); Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); @@ -507,7 +514,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } Addr addr = pkt->getAddr(); - int block_index = (addr / peerMemoryAtomSize) % numLines; + // int block_index = (addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(addr); DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); @@ -591,7 +599,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; + int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index a322379b05..28b204e198 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -91,8 +91,8 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - int nmpu; - Addr memoryAddressOffset; + // int nmpu; + // Addr memoryAddressOffset; WLEngine* peerWLEngine; PushEngine* peerPushEngine; @@ -174,7 +174,7 @@ class CoalesceEngine : public BaseMemoryEngine void recvPushRetry(); - virtual void startup() override; + // virtual void startup() override; }; } From 6c9e7c8d4c68d72742a39a50918f4df35eaa663c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 25 Jul 2022 20:51:48 -0700 Subject: [PATCH 132/287] Refactoring PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/sega/CoalesceEngine.py | 6 +- src/accl/graph/sega/PushEngine.py | 15 ++- src/accl/graph/sega/WLEngine.py | 11 +- src/accl/graph/sega/base_memory_engine.hh | 20 ++- src/accl/graph/sega/coalesce_engine.hh | 14 --- src/accl/graph/sega/push_engine.cc | 143 +++++++++++----------- src/accl/graph/sega/push_engine.hh | 17 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 8 files changed, 117 insertions(+), 111 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 536c3477ae..06c6f92750 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -36,7 +36,7 @@ class CoalesceEngine(BaseMemoryEngine): peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("16KiB", "Size of the internal SRAM array.") + cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int(4, "Number of MSHR entries.") - num_tgts_per_mshr = Param.Int(20, "Number of Targets Per MSHR.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index d3276799aa..447731219e 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -27,13 +27,20 @@ from m5.params import * from m5.proxy import * -from m5.objects.BaseMemEngine import BaseMemEngine +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class PushEngine(BaseMemEngine): +class PushEngine(BaseMemoryEngine): type = 'PushEngine' cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("") - push_req_queue_size = Param.Int(0, "") + base_edge_addr = Param.Addr("The base address for the " + "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " + "queue 
push requests.") + # resp_queue_size should probably be + # significantly bigger than push_req_queue_size + resp_queue_size = Param.Int("Size of the response queue in the " + "push engine where it stores the " + "edges read from memory") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index cab47fbe7b..98089328f4 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -35,6 +35,11 @@ class WLEngine(BaseReduceEngine): cxx_class = 'gem5::WLEngine' resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "") - update_queue_size = Param.Int(0, "") - on_the_fly_update_map_size = Param.Int(4, "") # 4 is arbitrary + coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " + "this WLEngine is connected to.") + update_queue_size = Param.Int("Size of the queue WLEngine stores " + "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " + "WLEngine has. 
It can service as " + "many updates as this queueu has " + "entries at the same time.") # 4 is arbitrary diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index efbfa5312d..5653ede698 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -43,7 +43,21 @@ namespace gem5 class BaseMemoryEngine : public ClockedObject { - private: + protected: + class MemoryEvent : public EventFunctionWrapper + { + private: + bool _pending; + public: + MemoryEvent(const std::function &callback, + const std::string &name): + EventFunctionWrapper(callback, name), _pending(false) + {} + bool pending() { return _pending; } + void sleep() { _pending = true; } + void wake() { _pending = false; } + }; + class MemPort : public RequestPort { private: @@ -65,13 +79,11 @@ class BaseMemoryEngine : public ClockedObject virtual void recvReqRetry(); }; - protected: System* system; const RequestorID _requestorId; - AddrRange peerMemoryRange; MemPort memPort; - + AddrRange peerMemoryRange; size_t peerMemoryAtomSize; virtual void recvMemRetry() = 0; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 28b204e198..b8cac15f5c 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,20 +47,6 @@ class WLEngine; class CoalesceEngine : public BaseMemoryEngine { private: - class MemoryEvent : public EventFunctionWrapper - { - private: - bool _pending; - public: - MemoryEvent(const std::function &callback, - const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) - {} - bool pending() { return _pending; } - void sleep() { _pending = true; } - void wake() { _pending = false; } - }; - struct Block { WorkListItem* items; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cfebf8e5df..d87462d7dd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ 
b/src/accl/graph/sega/push_engine.cc @@ -35,13 +35,15 @@ namespace gem5 { -PushEngine::PushEngine(const PushEngineParams ¶ms): - BaseMemEngine(params), +PushEngine::PushEngine(const Params ¶ms): + BaseMemoryEngine(params), reqPort(name() + ".req_port", this), baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), - nextAddrGenEvent([this] { processNextAddrGenEvent(); }, name()), + onTheFlyMemReqs(0), + memRespQueueSize(params.resp_queue_size), + nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) @@ -52,10 +54,8 @@ PushEngine::getPort(const std::string &if_name, PortID idx) { if (if_name == "req_port") { return reqPort; - } else if (if_name == "mem_port") { - return BaseMemEngine::getPort(if_name, idx); } else { - return SimObject::getPort(if_name, idx); + return BaseMemoryEngine::getPort(if_name, idx); } } @@ -98,9 +98,9 @@ PushEngine::ReqPort::recvReqRetry() if (!_blocked) { blockedPacket = nullptr; DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); + "_blocked: %s, (blockedPacket == nullptr): %s.\n", + __func__, _blocked ? "true" : "false", + (blockedPacket == nullptr) ? 
"true" : "false"); } } @@ -149,14 +149,9 @@ PushEngine::recvWLItem(WorkListItem wl) DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -178,67 +173,68 @@ PushEngine::recvWLItemRetry(WorkListItem wl) __func__, pushReqQueue.size()); numTotalRetries--; - if ((!nextAddrGenEvent.scheduled())) { - if (memQueueFull()) { - if (!pendingMemRetry()) { - requestMemRetry(1); - } - } else { - schedule(nextAddrGenEvent, nextCycle()); - } + if ((!nextMemoryReadEvent.pending()) && + (!nextMemoryReadEvent.scheduled())) { + schedule(nextMemoryReadEvent, nextCycle()); } } void -PushEngine::processNextAddrGenEvent() +PushEngine::processNextMemoryReadEvent() { - Addr aligned_addr, offset; - int num_edges; - - PushPacketInfoGen &curr_info = pushReqQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " - "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); - - PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); - - enqueueMemReq(pkt); - - if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + if (memPort.blocked()) { + nextMemoryReadEvent.sleep(); + return; } - if (memQueueFull()) { - if (!pushReqQueue.empty()) { - requestMemRetry(1); + if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + Addr aligned_addr, offset; + int num_edges; + + PushPacketInfoGen &curr_info = pushReqQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + DPRINTF(PushEngine, "%s: Current packet information generated by " + "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); + + PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); + reqOffsetMap[pkt->req] = offset; + reqNumEdgeMap[pkt->req] = num_edges; + reqValueMap[pkt->req] = curr_info.value(); + + memPort.sendPacket(pkt); + onTheFlyMemReqs++; + + if (curr_info.done()) { + DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + pushReqQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. 
" + "pushReqQueue.size() = %u.\n", + __func__, pushReqQueue.size()); + if (numTotalRetries > 0) { + int free_space = pushReqQueueSize - + (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); + DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" + " free spaces.\n", __func__, free_space); + if ((free_space >= numElementsPerLine) && + (numPendingRetries == 0)) { + DPRINTF(PushEngine, "%s: Sent a push retry to " + "peerCoalesceEngine.\n", __func__); + if (!nextSendRetryEvent.scheduled()) { + schedule(nextSendRetryEvent, nextCycle()); + } + } + } } - return; } - if ((!nextAddrGenEvent.scheduled()) && (!pushReqQueue.empty())) { - schedule(nextAddrGenEvent, nextCycle()); + // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if (!pushReqQueue.empty()) { + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + schedule(nextMemoryReadEvent, nextCycle()); } } @@ -253,9 +249,11 @@ PushEngine::processNextSendRetryEvent() void PushEngine::recvMemRetry() { - assert(!nextAddrGenEvent.scheduled()); - DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); - schedule(nextAddrGenEvent, nextCycle()); + if (nextMemoryReadEvent.pending()) { + DPRINTF(PushEngine, "%s: Received a memory retry.\n", __func__); + nextMemoryReadEvent.wake(); + schedule(nextMemoryReadEvent, nextCycle()); + } } bool @@ -264,7 +262,8 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. 
assert(pkt->isResponse() && (!pkt->isWrite())); memRespQueue.push_back(pkt); - assert(memRespQueue.size() <= respQueueSize); + onTheFlyMemReqs--; + assert(memRespQueue.size() <= memRespQueueSize); if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 378cd1a487..9b182e2251 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,7 +29,7 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/base/base_mem_engine.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -39,7 +39,7 @@ namespace gem5 class CoalesceEngine; -class PushEngine : public BaseMemEngine +class PushEngine : public BaseMemoryEngine { private: class PushPacketInfoGen { @@ -115,15 +115,14 @@ class PushEngine : public BaseMemEngine std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; - // Since the push engine can process incoming packets faster than - // memory can send those packets, the size of this queue will - // always be limited by the b/w of the memory. 
+ int onTheFlyMemReqs; + int memRespQueueSize; std::deque memRespQueue; template PacketPtr createUpdatePacket(Addr addr, T value); - EventFunctionWrapper nextAddrGenEvent; - void processNextAddrGenEvent(); + MemoryEvent nextMemoryReadEvent; + void processNextMemoryReadEvent(); EventFunctionWrapper nextPushEvent; void processNextPushEvent(); @@ -145,13 +144,12 @@ class PushEngine : public BaseMemEngine PushStats stats; protected: - virtual int respBuffSize() { return memRespQueue.size(); } virtual void recvMemRetry(); virtual bool handleMemResp(PacketPtr pkt); public: PARAMS(PushEngine); - PushEngine(const PushEngineParams ¶ms); + PushEngine(const Params ¶ms); Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; @@ -169,7 +167,6 @@ class PushEngine : public BaseMemEngine int getNumRetries() { return numTotalRetries; } - void recvRetryReject() { numPendingRetries--; } }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2d4ffc9cac..12f4548aa2 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -40,7 +40,7 @@ WLEngine::WLEngine(const WLEngineParams ¶ms): respPort(name() + ".resp_port", this), coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), - registerFileSize(params.on_the_fly_update_map_size), + registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) From b7e76bfdb113a55311db67e0532495e958b4794b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:01:42 -0700 Subject: [PATCH 133/287] Refactored PushEngine to inherit from BaseMemoryEngine. 
--- src/accl/graph/SConscript | 4 +- src/accl/graph/base/BaseMemEngine.py | 47 --- src/accl/graph/base/SConscript | 3 - src/accl/graph/base/base_mem_engine.cc | 225 -------------- src/accl/graph/base/base_mem_engine.hh | 125 -------- src/accl/graph/sega/base_memory_engine.cc | 4 + src/accl/graph/sega/base_memory_engine.hh | 7 +- src/accl/graph/sega/coalesce_engine.cc | 362 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- 9 files changed, 275 insertions(+), 511 deletions(-) delete mode 100644 src/accl/graph/base/BaseMemEngine.py delete mode 100644 src/accl/graph/base/base_mem_engine.cc delete mode 100644 src/accl/graph/base/base_mem_engine.hh diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript index 53c6411de6..5dffd1a396 100644 --- a/src/accl/graph/SConscript +++ b/src/accl/graph/SConscript @@ -28,5 +28,5 @@ Import('*') DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', 'WLEngine', - 'BaseMemEngine', 'BaseMemoryEngine']) +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/BaseMemEngine.py b/src/accl/graph/base/BaseMemEngine.py deleted file mode 100644 index 2ecb6659d8..0000000000 --- a/src/accl/graph/base/BaseMemEngine.py +++ /dev/null @@ -1,47 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -from m5.params import * -from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject - -class BaseMemEngine(ClockedObject): - abstract = True - type = 'BaseMemEngine' - cxx_header = "accl/graph/base/base_mem_engine.hh" - cxx_class = 'gem5::BaseMemEngine' - - system = Param.System(Parent.any, 'System this Engine is a part of') - mem_port = RequestPort("Port to communicate with the memory") - - outstanding_mem_req_queue_size = Param.Int(16, "Capacity of queue in " - "which memory requests are queued.") - - attached_memory_atom_size = Param.Int(64, "The atom size of the attached " - "memory.") - - resp_queue_size = Param.Int(64, "blah") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 45877a12ca..0e43d1aed8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -27,9 +27,6 @@ Import('*') -SimObject('BaseMemEngine.py') SimObject('BaseReduceEngine.py') -Source('base_mem_engine.cc') Source('base_reduce_engine.cc') -DebugFlag('BaseMemEngine') diff --git a/src/accl/graph/base/base_mem_engine.cc b/src/accl/graph/base/base_mem_engine.cc deleted file mode 100644 index 590307b2bc..0000000000 --- a/src/accl/graph/base/base_mem_engine.cc +++ /dev/null @@ -1,225 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/base/base_mem_engine.hh" - -#include "debug/BaseMemEngine.hh" -#include "debug/SEGAStructureSize.hh" - -namespace gem5 -{ - -BaseMemEngine::BaseMemEngine(const BaseMemEngineParams ¶ms): - ClockedObject(params), - system(params.system), - memPort(name() + ".mem_port", this), - memQueueSize(params.outstanding_mem_req_queue_size), - onTheFlyReqs(0), - memRetryRequested(false), - memSpaceRequested(0), - nextMemReqEvent([this] { processNextMemReqEvent(); }, name()), - respQueueSize(params.resp_queue_size), - _requestorId(system->getRequestorId(this)), - peerMemoryAtomSize(params.attached_memory_atom_size) -{} - -BaseMemEngine::~BaseMemEngine() -{} - -Port& -BaseMemEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "mem_port") { - return memPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - -void -BaseMemEngine::MemPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -BaseMemEngine::MemPort::recvTimingResp(PacketPtr pkt) -{ - //TODO: Investigate sending true all the time - return owner->recvTimingResp(pkt); -} - -void -BaseMemEngine::MemPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } - - owner->wakeUp(); -} - -void -BaseMemEngine::processNextMemReqEvent() -{ - if ((respQueueSize == 0) || - ((respBuffSize() + onTheFlyReqs) < respQueueSize)) { - PacketPtr pkt = memQueue.front(); - memPort.sendPacket(pkt); - onTheFlyReqs++; - DPRINTF(BaseMemEngine, "%s: Sent a packet to memory with the following info. 
" - "pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - memQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Popped pkt: %s from " - "memQueue. memQueue.size = %d, memQueueSize = %d.\n", - __func__, pkt->print(), memQueue.size(), memQueueSize); - if (memRetryRequested && - (memQueue.size() <= - (memQueueSize - memSpaceRequested))) { - memRetryRequested = false; - memSpaceRequested = 0; - recvMemRetry(); - } - } - - if ((!memPort.blocked()) && - (!memQueue.empty()) && (!nextMemReqEvent.scheduled())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -PacketPtr -BaseMemEngine::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; -} - -PacketPtr -BaseMemEngine::createWritePacket(Addr addr, unsigned int size, uint8_t* data) -{ - RequestPtr req = std::make_shared(addr, size, 0, _requestorId); - - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::WriteReq); - pkt->allocate(); - pkt->setData(data); - - return pkt; -} - -bool -BaseMemEngine::allocateMemQueueSpace(int space) -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize == 0) || - (memQueue.size() <= (memQueueSize - space)) - ); -} - -bool -BaseMemEngine::memQueueFull() -{ - assert((memQueueSize == 0) || - (memQueue.size() <= memQueueSize)); - return ( - (memQueueSize != 0) && - (memQueue.size() == memQueueSize)); -} - -void 
-BaseMemEngine::enqueueMemReq(PacketPtr pkt) -{ - panic_if(memQueueFull(), "Should not enqueue if queue full.\n"); - memQueue.push_back(pkt); - DPRINTF(SEGAStructureSize, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - DPRINTF(BaseMemEngine, "%s: Pushed pkt: %s to memQueue. " - "memQueue.size = %d, memQueueSize = %d.\n", __func__, - pkt->print(), memQueue.size(), memQueueSize); - if ((!nextMemReqEvent.scheduled()) && (!memPort.blocked())) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -void -BaseMemEngine::requestMemRetry(int space) { - panic_if((memRetryRequested == true) || (memSpaceRequested != 0), - "You should not request another alarm without the first one being" - "responded to.\n"); - DPRINTF(BaseMemEngine, "%s: Alarm requested with space = %d.\n", __func__, space); - memRetryRequested = true; - memSpaceRequested = space; -} - -void -BaseMemEngine::wakeUp() -{ - assert(!nextMemReqEvent.scheduled()); - if (!memQueue.empty()) { - schedule(nextMemReqEvent, nextCycle()); - } -} - -bool -BaseMemEngine::recvTimingResp(PacketPtr pkt) -{ - onTheFlyReqs--; - return handleMemResp(pkt); -} - -} diff --git a/src/accl/graph/base/base_mem_engine.hh b/src/accl/graph/base/base_mem_engine.hh deleted file mode 100644 index 01c862d555..0000000000 --- a/src/accl/graph/base/base_mem_engine.hh +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ -#define __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ - -#include - -#include "base/addr_range.hh" -#include "mem/packet.hh" -#include "mem/port.hh" -#include "params/BaseMemEngine.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" - -namespace gem5 -{ - -class BaseMemEngine : public ClockedObject -{ - private: - class MemPort : public RequestPort - { - private: - BaseMemEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - MemPort(const std::string& name, BaseMemEngine* owner): - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - System* system; - MemPort memPort; - - int memQueueSize; - int onTheFlyReqs; - bool memRetryRequested; - int memSpaceRequested; - std::deque memQueue; - - EventFunctionWrapper nextMemReqEvent; - void processNextMemReqEvent(); - - protected: - - int respQueueSize; - const RequestorID _requestorId; - - size_t peerMemoryAtomSize; - - bool allocateMemQueueSpace(int space); - bool memQueueFull(); - - bool pendingMemRetry() { return memRetryRequested; } - void requestMemRetry(int space); - - void sendMemFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } - void enqueueMemReq(PacketPtr pkt); - - virtual int respBuffSize() = 0; - virtual void recvMemRetry() = 0; - virtual bool handleMemResp(PacketPtr pkt) = 0; - - PacketPtr createReadPacket(Addr addr, unsigned int size); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); - - public: - PARAMS(BaseMemEngine); - - BaseMemEngine(const BaseMemEngineParams ¶ms); - ~BaseMemEngine(); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - RequestorID requestorId() { return _requestorId; } - - AddrRangeList getAddrRanges() {return memPort.getAddrRanges(); 
} - - bool recvTimingResp(PacketPtr pkt); - void recvFunctional(PacketPtr pkt); - - void wakeUp(); - -}; - -} - -#endif // __ACCL_GRAPH_BASE_BASE_MEM_ENGINE_HH__ diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9db95d6bd6..c60d189e0f 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -77,7 +77,11 @@ BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { blockedPacket = pkt; _blocked = true; + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. blockedPacket %s.\n", + __func__, blockedPacket->print()); } else { + DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", + __func__, pkt->print()); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index 5653ede698..f336edcbf1 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -48,14 +48,19 @@ class BaseMemoryEngine : public ClockedObject { private: bool _pending; + int _prevState; + public: MemoryEvent(const std::function &callback, const std::string &name): - EventFunctionWrapper(callback, name), _pending(false) + EventFunctionWrapper(callback, name), + _pending(false), _prevState(0) {} bool pending() { return _pending; } void sleep() { _pending = true; } void wake() { _pending = false; } + void setPrevState(int state) { _prevState = state; } + int getPrevState() { return _prevState; } }; class MemPort : public RequestPort diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21f048213a..daaed28f1c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -28,6 +28,8 @@ #include "accl/graph/sega/coalesce_engine.hh" +#include + #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" @@ -53,7 +55,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): 
nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), + nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -317,6 +319,10 @@ CoalesceEngine::processNextMemoryReadEvent() pendingEventQueue.push_back("nextMemoryReadEvent"); // Maximum three MemoryEvents. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + "has been pushed to pendingEventQueue. " + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -366,11 +372,14 @@ CoalesceEngine::processNextRespondEvent() void CoalesceEngine::recvMemRetry() { + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); if (pendingEventQueue.empty()) { + DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); return; } std::string front = pendingEventQueue.front(); + DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); if (front == "nextMemoryReadEvent") { assert(!nextMemoryReadEvent.scheduled()); @@ -382,11 +391,11 @@ CoalesceEngine::recvMemRetry() assert(nextWriteBackEvent.pending()); schedule(nextWriteBackEvent, nextCycle()); nextWriteBackEvent.wake(); - } else if (front == "nextSendRetryEvent") { - assert(!nextSendRetryEvent.scheduled()); - assert(nextSendRetryEvent.pending()); - schedule(nextSendRetryEvent, nextCycle()); - nextSendRetryEvent.wake(); + } else if (front == "nextRecvPushRetryEvent") { + assert(!nextRecvPushRetryEvent.scheduled()); + assert(nextRecvPushRetryEvent.pending()); + schedule(nextRecvPushRetryEvent, nextCycle()); + nextRecvPushRetryEvent.wake(); } else { panic("EVENT IS NOT RECOGNIZED.\n"); } @@ -642,14 +651,16 @@ CoalesceEngine::processNextApplyEvent() int 
block_index = applyQueue.front(); if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid apply process. " - "Therefore, ignoring the apply schedule.\n", + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " + "apply process. Therefore, ignoring the apply schedule.\n", __func__, block_index); stats.falseApplySchedules++; } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. Therefore, no apply " - "needed.\n", __func__, block_index); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. " + "Therefore, no apply needed.\n", __func__, block_index); } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", + __func__, block_index); for (int i = 0; i < numElementsPerLine; i++) { uint32_t old_prop = cacheBlocks[block_index].items[i].prop; uint32_t new_prop = std::min( @@ -683,8 +694,9 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].hasConflict){ writeBackQueue.push_back(block_index); assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. writeBackQueue.size = %u.\n", - __func__, block_index, writeBackQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + "writeBackQueue.size = %u.\n", __func__, + block_index, writeBackQueue.size()); } applyQueue.pop_front(); @@ -710,6 +722,10 @@ CoalesceEngine::processNextWriteBackEvent() pendingEventQueue.push_back("nextWriteBackEvent"); // Maximum three MemoryEvent. assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + "has been pushed to pendingEventQueue. 
" + "pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); return; } @@ -774,121 +790,259 @@ void CoalesceEngine::recvPushRetry() { numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextSendRetryEvent.pending()); - assert(!nextSendRetryEvent.scheduled()); + assert(!nextRecvPushRetryEvent.pending()); + assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextSendRetryEvent, nextCycle()); + schedule(nextRecvPushRetryEvent, nextCycle()); } -void -CoalesceEngine::processNextSendRetryEvent() +// void +// CoalesceEngine::processNextRecvPushRetryEvent() +// { +// assert(!nextRecvPushRetryEvent.pending()); +// assert(needsPush.count() != 0); + +// Addr block_addr = 0; +// int block_index = 0; +// int it = 0; +// uint32_t slice = 0; +// bool hit_in_cache = false; + +// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { +// for (int i = 0; i < numElementsPerLine; i++) { +// slice <<= 1; +// slice |= needsPush[it + i]; +// } +// if (slice) { +// block_addr = getBlockAddrFromBitIndex(it); +// block_index = getBlockIndex(block_addr); +// if ((cacheBlocks[block_index].addr == block_addr) && +// (cacheBlocks[block_index].valid)) { +// if (cacheBlocks[block_index].busyMask == 0) { +// hit_in_cache = true; +// break; +// } +// } else { +// hit_in_cache = false; +// break; +// } +// } +// } + +// assert(it < MAX_BITVECTOR_SIZE); +// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { +// currentBitSliceIndex = 0; +// } else { +// currentBitSliceIndex = it + numElementsPerLine; +// } + +// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " +// "in needsPush.\n", __func__, slice, it); + +// if (hit_in_cache) { +// int push_needed = 0; +// 
DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// // TODO: Make this more programmable +// uint32_t new_prop = std::min( +// cacheBlocks[block_index].items[i].prop, +// cacheBlocks[block_index].items[i].tempProp); +// cacheBlocks[block_index].items[i].tempProp = new_prop; +// cacheBlocks[block_index].items[i].prop = new_prop; +// if (needsPush[it + i] == 1) { +// peerPushEngine->recvWLItemRetry( +// cacheBlocks[block_index].items[i]); +// } +// push_needed += needsPush[it + i]; +// needsPush[it + i] = 0; +// } +// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); +// assert(peerPushEngine->getNumRetries() == needsPush.count()); +// if (applyQueue.find(block_index)) { +// applyQueue.erase(block_index); +// if (applyQueue.empty() && nextApplyEvent.scheduled()) { +// deschedule(nextApplyEvent); +// } +// if (cacheBlocks[block_index].hasConflict) { +// writeBackQueue.push_back(block_index); +// assert(writeBackQueue.size() <= numLines); +// if ((!writeBackQueue.empty()) && +// (!nextWriteBackEvent.pending()) && +// (!nextWriteBackEvent.scheduled())) { +// schedule(nextWriteBackEvent, nextCycle()); +// } +// } +// } +// } else { +// if (memPort.blocked()) { +// nextRecvPushRetryEvent.sleep(); +// pendingEventQueue.push_back("nextRecvPushRetryEvent"); +// // Maximum three MemoryEvent. +// assert(pendingEventQueue.size() <= 3); +// return; +// } + +// // FIXME: Fix the retry mechanism between memory and cache to +// // handle memory retries correctly. This probably requires scheduling +// // an event for sending the retry. For now we're enabling infinite +// // queueing in the memQueue. +// // FIXME: Also do not send requests for cache lines that are already +// // read but await data. 
Just set a flag or sth. +// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// } + +// numRetriesReceived--; +// assert(numRetriesReceived == 0); +// assert(!nextRecvPushRetryEvent.scheduled()); +// } + +std::tuple +CoalesceEngine::getOptimalBitVectorSlice() { - assert(!nextSendRetryEvent.pending()); - assert(needsPush.count() != 0); + bool hit_in_cache; + int slice_base = -1; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - Addr block_addr = 0; - int block_index = 0; - int it = 0; - uint32_t slice = 0; - bool hit_in_cache = false; - - for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { + int score = 0; + uint32_t current_popcount = 0; + for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { + int current_score = 0; for (int i = 0; i < numElementsPerLine; i++) { - slice <<= 1; - slice |= needsPush[it + i]; + current_popcount += needsPush[it + i]; } - if (slice) { - block_addr = getBlockAddrFromBitIndex(it); - block_index = getBlockIndex(block_addr); - if ((cacheBlocks[block_index].addr == block_addr) && - (cacheBlocks[block_index].valid)) { - if (cacheBlocks[block_index].busyMask == 0) { - hit_in_cache = true; - break; - } - } else { + if (current_popcount == 0) { + continue; + } + current_score += current_popcount; + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].busyMask == 0)) { + current_score += numElementsPerLine * 2; + if (current_score > score) { + score = current_score; + slice_base = it; + hit_in_cache = true; + } + } else if (!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].allocated))) { + score += numElementsPerLine; + if (current_score > 
score) { + score = current_score; + slice_base = it; hit_in_cache = false; - break; } } } - assert(it < MAX_BITVECTOR_SIZE); - if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { - currentBitSliceIndex = 0; - } else { - currentBitSliceIndex = it + numElementsPerLine; - } + return std::make_tuple(hit_in_cache, slice_base); +} + +void +CoalesceEngine::processNextRecvPushRetryEvent() +{ + bool hit_in_cache; + int slice_base; + std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); - DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " - "in needsPush.\n", __func__, slice, it); + if (slice_base != -1) { + Addr addr = getBlockAddrFromBitIndex(slice_base); + int block_index = getBlockIndex(addr); + if (hit_in_cache) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + "its MemRetry.\n", __func__); + recvMemRetry(); + nextRecvPushRetryEvent.setPrevState(0); + } - if (hit_in_cache) { - int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = std::min( + int push_needed = 0; + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + + for (int i = 0; i < numElementsPerLine; i++) { + // TODO: Make this more programmable + uint32_t new_prop = std::min( cacheBlocks[block_index].items[i].prop, cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - 
cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); + cacheBlocks[block_index].items[i].tempProp = new_prop; + cacheBlocks[block_index].items[i].prop = new_prop; + if (needsPush[slice_base + i] == 1) { + peerPushEngine->recvWLItemRetry( + cacheBlocks[block_index].items[i]); + } + push_needed += needsPush[slice_base + i]; + needsPush[slice_base + i] = 0; } - if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + if (applyQueue.find(block_index)) { + applyQueue.erase(block_index); + if (applyQueue.empty() && nextApplyEvent.scheduled()) { + deschedule(nextApplyEvent); + } + if (cacheBlocks[block_index].hasConflict) { + writeBackQueue.push_back(block_index); + assert(writeBackQueue.size() <= numLines); + if ((!nextWriteBackEvent.pending()) && + (!nextWriteBackEvent.scheduled())) { + schedule(nextWriteBackEvent, nextCycle()); + } } } - } - } else { - if (memPort.blocked()) { - nextSendRetryEvent.sleep(); - pendingEventQueue.push_back("nextSendRetryEvent"); - // Maximum three MemoryEvent. 
- assert(pendingEventQueue.size() <= 3); - return; - } + } else { + if (memPort.blocked()) { + assert(nextRecvPushRetryEvent.getPrevState() != -1); + nextRecvPushRetryEvent.setPrevState(-1); + nextRecvPushRetryEvent.sleep(); + pendingEventQueue.push_back("nextRecvPushRetryEvent"); + assert(pendingEventQueue.size() <= 3); + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + "and has been pushed to pendingEventQueue." + " pendingEventQueue.size = %d.\n", + __func__, pendingEventQueue.size()); + return; + } + // if nextRecvPushRetryEvent has been blocked by memory before + if (nextRecvPushRetryEvent.getPrevState() == -1) { + DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + "unblocked by memPort. Setting prevState to 0.\n", __func__); + nextRecvPushRetryEvent.setPrevState(0); + } - // FIXME: Fix the retry mechanism between memory and cache to - // handle memory retries correctly. This probably requires scheduling - // an event for sending the retry. For now we're enabling infinite - // queueing in the memQueue. - // FIXME: Also do not send requests for cache lines that are already - // read but await data. Just set a flag or sth. - PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + // TODO: Set a tracking structure so that nextMemoryReadEvent knows + // It does not have to read this address anymore. It can simply set + // a flag to true (maybe not even needed just look if the cache has a + // line allocated for it in the cacheBlocks). 
+ } + numRetriesReceived--; + assert(numRetriesReceived == 0); + } + if (numRetriesReceived > 0) { + schedule(nextRecvPushRetryEvent, nextCycle()); } - - numRetriesReceived--; - assert(numRetriesReceived == 0); - assert(!nextSendRetryEvent.scheduled()); } CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b8cac15f5c..356fee0107 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,6 +106,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalBitVectorSlice(); std::deque pendingEventQueue; @@ -121,8 +122,8 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextWriteBackEvent; void processNextWriteBackEvent(); - MemoryEvent nextSendRetryEvent; - void processNextSendRetryEvent(); + MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(); struct CoalesceStats : public statistics::Group { @@ -145,8 +146,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceStats stats; protected: - virtual void recvMemRetry(); - virtual bool handleMemResp(PacketPtr pkt); + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; public: PARAMS(CoalesceEngine); From 0fc5c5efb512183db2b35cc30217555073973296 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 26 Jul 2022 09:49:11 -0700 Subject: [PATCH 134/287] Making bit vector smaller and choosing slices faster. 
--- src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index daaed28f1c..f86d6877ad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -914,9 +914,10 @@ CoalesceEngine::getOptimalBitVectorSlice() int slice_base = -1; int score = 0; - uint32_t current_popcount = 0; + int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { int current_score = 0; + uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } @@ -934,6 +935,9 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = true; + if (score == max_score_possible) { + break; + } } } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { @@ -942,6 +946,7 @@ CoalesceEngine::getOptimalBitVectorSlice() score = current_score; slice_base = it; hit_in_cache = false; + assert(score < max_score_possible); } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 356fee0107..f6ed4843fa 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -37,7 +37,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 30) +#define MAX_BITVECTOR_SIZE (1 << 28) namespace gem5 { From ef61dcfccf1e22ea364b6ce13437c9ea9676fceb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 28 Jul 2022 06:36:15 -0700 Subject: [PATCH 135/287] Merging all memory interactions into one event. 
--- src/accl/graph/sega/coalesce_engine.cc | 559 +++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 24 +- 2 files changed, 255 insertions(+), 328 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f86d6877ad..4d7107274b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,17 +45,15 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - currentBitSliceIndex(0), - numRetriesReceived(0), - applyQueue(numLines), - writeBackQueue(numLines), - nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), + numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + numRetriesReceived(0), applyQueue(numLines), + // writeBackQueue(numLines), + nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), + // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextRespondEvent([this] { processNextRespondEvent(); }, name()), nextApplyEvent([this] { processNextApplyEvent(); }, name()), - nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), + // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,49 +67,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): needsPush.reset(); } -// void -// CoalesceEngine::startup() -// { -// return; - // std::cout << "Hello" << std::endl; - // DPRINTF(CoalesceEngine, "%s: Range attached to this engine is %s.\n", - // __func__, 
peerMemoryRange.to_string()); - // AddrRangeList vertex_ranges = getAddrRanges(); - - // bool found = false; - // Addr first_match_addr = 0; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(first_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // first_match_addr += peerMemoryAtomSize; - // } - - // found = false; - // Addr second_match_addr = first_match_addr + peerMemoryAtomSize; - // while(true) { - // for (auto range: vertex_ranges) { - // if (range.contains(second_match_addr)) { - // found = true; - // break; - // } - // } - // if (found) { - // break; - // } - // second_match_addr += peerMemoryAtomSize; - // } - - // nmpu = (int) ((second_match_addr - first_match_addr) / peerMemoryAtomSize); - // memoryAddressOffset = first_match_addr; -// } - void CoalesceEngine::registerWLEngine(WLEngine* wl_engine) { @@ -260,15 +215,20 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); // FIXME: Fix this DPRINTF // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", // __func__, fillQueue.size()); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())) { + // schedule(nextMemoryReadEvent, nextCycle()); + // } + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } stats.readMisses++; stats.numVertexReads++; @@ -309,24 +269,24 @@ CoalesceEngine::recvWLRead(Addr addr) } void -CoalesceEngine::processNextMemoryReadEvent() +CoalesceEngine::processNextMemoryReadEvent(int block_index) { - assert(!nextMemoryReadEvent.pending()); - if (memPort.blocked()) { - // TODO: Implement interface where events of the CoalesceEngine are - // pushed to a fifo to be scheduled later. - nextMemoryReadEvent.sleep(); - pendingEventQueue.push_back("nextMemoryReadEvent"); - // Maximum three MemoryEvents. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextMemoryReadEvent.pending()); + // if (memPort.blocked()) { + // // TODO: Implement interface where events of the CoalesceEngine are + // // pushed to a fifo to be scheduled later. + // nextMemoryReadEvent.sleep(); + // pendingEventQueue.push_back("nextMemoryReadEvent"); + // // Maximum three MemoryEvents. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = fillQueue.front(); + // int block_index = fillQueue.front(); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -334,13 +294,11 @@ CoalesceEngine::processNextMemoryReadEvent() memPort.sendPacket(pkt); - fillQueue.pop_front(); + // fillQueue.pop_front(); - if (!fillQueue.empty()) { - assert(!nextMemoryReadEvent.scheduled()); - assert(!nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - } + // if (!fillQueue.empty()) { + // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); + // } } // TODO: For loop to empty the entire responseQueue. @@ -370,38 +328,70 @@ CoalesceEngine::processNextRespondEvent() } void -CoalesceEngine::recvMemRetry() +CoalesceEngine::processNextMemoryEvent() { - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - if (pendingEventQueue.empty()) { - DPRINTF(CoalesceEngine, "%s: No events pending.\n", __func__); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); return; } - std::string front = pendingEventQueue.front(); - DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - if (front == "nextMemoryReadEvent") { - assert(!nextMemoryReadEvent.scheduled()); - assert(nextMemoryReadEvent.pending()); - schedule(nextMemoryReadEvent, nextCycle()); - nextMemoryReadEvent.wake(); - } else if (front == "nextWriteBackEvent") { - assert(!nextWriteBackEvent.scheduled()); - assert(nextWriteBackEvent.pending()); - schedule(nextWriteBackEvent, nextCycle()); - nextWriteBackEvent.wake(); - } else if (front == "nextRecvPushRetryEvent") { - assert(!nextRecvPushRetryEvent.scheduled()); - assert(nextRecvPushRetryEvent.pending()); - schedule(nextRecvPushRetryEvent, nextCycle()); - nextRecvPushRetryEvent.wake(); - } else { - panic("EVENT IS NOT RECOGNIZED.\n"); + 
std::function next_memory_function; + int next_memory_function_input; + std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); } +} - pendingEventQueue.pop_front(); - return; +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + // if (pendingEventQueue.empty()) { + // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + // return; + // } + + // std::string front = pendingEventQueue.front(); + // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); + + // if (front == "nextMemoryReadEvent") { + // assert(!nextMemoryReadEvent.scheduled()); + // assert(nextMemoryReadEvent.pending()); + // schedule(nextMemoryReadEvent, nextCycle()); + // nextMemoryReadEvent.wake(); + // } else if (front == "nextWriteBackEvent") { + // assert(!nextWriteBackEvent.scheduled()); + // assert(nextWriteBackEvent.pending()); + // schedule(nextWriteBackEvent, nextCycle()); + // nextWriteBackEvent.wake(); + // } else if (front == "nextRecvPushRetryEvent") { + // assert(!nextRecvPushRetryEvent.scheduled()); + // assert(nextRecvPushRetryEvent.pending()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // nextRecvPushRetryEvent.wake(); + // } else { + // panic("EVENT IS NOT RECOGNIZED.\n"); + // } + + // pendingEventQueue.pop_front(); + // return; + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, 
nextCycle()); } // FIXME: Fix this function. @@ -464,12 +454,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { @@ -528,9 +523,12 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", __func__, pkt->getAddr()); - assert((cacheBlocks[block_index].allocated) && // allocated cache block - (!cacheBlocks[block_index].valid) && // valid is false - (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + // assert((cacheBlocks[block_index].allocated) && // allocated cache block + // (!cacheBlocks[block_index].valid) && // valid is false + // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -691,22 +689,21 @@ CoalesceEngine::processNextApplyEvent() } // TODO: This is where eviction policy goes - if (cacheBlocks[block_index].hasConflict){ - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. 
" - "writeBackQueue.size = %u.\n", __func__, - block_index, writeBackQueue.size()); + if ((cacheBlocks[block_index].hasConflict) && + (cacheBlocks[block_index].busyMask == 0)) { + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " + // "writeBackQueue.size = %u.\n", __func__, + // block_index, writeBackQueue.size()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } applyQueue.pop_front(); - - if ((!writeBackQueue.empty()) && - (!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } - if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -714,22 +711,22 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent() +CoalesceEngine::processNextWriteBackEvent(int block_index) { - assert(!nextWriteBackEvent.pending()); - if (memPort.blocked()) { - nextWriteBackEvent.sleep(); - pendingEventQueue.push_back("nextWriteBackEvent"); - // Maximum three MemoryEvent. - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - "has been pushed to pendingEventQueue. " - "pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // assert(!nextWriteBackEvent.pending()); + // if (memPort.blocked()) { + // nextWriteBackEvent.sleep(); + // pendingEventQueue.push_back("nextWriteBackEvent"); + // // Maximum three MemoryEvent. + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " + // "has been pushed to pendingEventQueue. 
" + // "pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } - int block_index = writeBackQueue.front(); + // int block_index = writeBackQueue.front(); // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -769,21 +766,35 @@ CoalesceEngine::processNextWriteBackEvent() cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - fillQueue.push_back(block_index); - assert(fillQueue.size() <= numLines); - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())){ - schedule(nextMemoryReadEvent, nextCycle()); - } + // fillQueue.push_back(block_index); + // assert(fillQueue.size() <= numLines); + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); + // if ((!nextMemoryReadEvent.pending()) && + // (!nextMemoryReadEvent.scheduled())){ + // schedule(nextMemoryReadEvent, nextCycle()); + // } + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } - writeBackQueue.pop_front(); - - if (!writeBackQueue.empty()) { - assert(!nextWriteBackEvent.pending()); - assert(!nextWriteBackEvent.scheduled()); - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.pop_front(); + // assert(writeBackQueue.size() <= numLines); + // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" + // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", + // __func__, block_index, writeBackQueue.size(), numLines); + + // if (!writeBackQueue.empty()) { + // assert(!nextWriteBackEvent.pending()); + // assert(!nextWriteBackEvent.scheduled()); + // schedule(nextWriteBackEvent, nextCycle()); + // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // } } void @@ -793,130 +804,28 @@ CoalesceEngine::recvPushRetry() DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); // For now since we do only one retry at a time, we should not receive // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(!nextRecvPushRetryEvent.pending()); - assert(!nextRecvPushRetryEvent.scheduled()); + // assert(!nextRecvPushRetryEvent.pending()); + // assert(!nextRecvPushRetryEvent.scheduled()); assert(numRetriesReceived == 1); - schedule(nextRecvPushRetryEvent, nextCycle()); + // schedule(nextRecvPushRetryEvent, nextCycle()); + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } -// void -// CoalesceEngine::processNextRecvPushRetryEvent() -// { -// assert(!nextRecvPushRetryEvent.pending()); -// assert(needsPush.count() != 0); - -// Addr block_addr = 0; -// int block_index = 0; -// int it = 0; -// uint32_t slice = 0; -// bool hit_in_cache = false; - -// for (it = currentBitSliceIndex; it < MAX_BITVECTOR_SIZE; -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE) { -// for (int i = 0; i < numElementsPerLine; i++) { -// slice <<= 1; -// slice |= needsPush[it + i]; -// } -// if (slice) { -// block_addr = getBlockAddrFromBitIndex(it); -// block_index = 
getBlockIndex(block_addr); -// if ((cacheBlocks[block_index].addr == block_addr) && -// (cacheBlocks[block_index].valid)) { -// if (cacheBlocks[block_index].busyMask == 0) { -// hit_in_cache = true; -// break; -// } -// } else { -// hit_in_cache = false; -// break; -// } -// } -// } - -// assert(it < MAX_BITVECTOR_SIZE); -// if ((it + numElementsPerLine) > MAX_BITVECTOR_SIZE) { -// currentBitSliceIndex = 0; -// } else { -// currentBitSliceIndex = it + numElementsPerLine; -// } - -// DPRINTF(CoalesceEngine, "%s: Found slice with value %d at position %d " -// "in needsPush.\n", __func__, slice, it); - -// if (hit_in_cache) { -// int push_needed = 0; -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// // TODO: Make this more programmable -// uint32_t new_prop = std::min( -// cacheBlocks[block_index].items[i].prop, -// cacheBlocks[block_index].items[i].tempProp); -// cacheBlocks[block_index].items[i].tempProp = new_prop; -// cacheBlocks[block_index].items[i].prop = new_prop; -// if (needsPush[it + i] == 1) { -// peerPushEngine->recvWLItemRetry( -// cacheBlocks[block_index].items[i]); -// } -// push_needed += needsPush[it + i]; -// needsPush[it + i] = 0; -// } -// DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); -// assert(peerPushEngine->getNumRetries() == needsPush.count()); -// if (applyQueue.find(block_index)) { -// applyQueue.erase(block_index); -// if (applyQueue.empty() && nextApplyEvent.scheduled()) { -// deschedule(nextApplyEvent); -// } -// if (cacheBlocks[block_index].hasConflict) { -// writeBackQueue.push_back(block_index); -// assert(writeBackQueue.size() <= numLines); -// if ((!writeBackQueue.empty()) && -// (!nextWriteBackEvent.pending()) && -// (!nextWriteBackEvent.scheduled())) { -// 
schedule(nextWriteBackEvent, nextCycle()); -// } -// } -// } -// } else { -// if (memPort.blocked()) { -// nextRecvPushRetryEvent.sleep(); -// pendingEventQueue.push_back("nextRecvPushRetryEvent"); -// // Maximum three MemoryEvent. -// assert(pendingEventQueue.size() <= 3); -// return; -// } - -// // FIXME: Fix the retry mechanism between memory and cache to -// // handle memory retries correctly. This probably requires scheduling -// // an event for sending the retry. For now we're enabling infinite -// // queueing in the memQueue. -// // FIXME: Also do not send requests for cache lines that are already -// // read but await data. Just set a flag or sth. -// PacketPtr pkt = createReadPacket(block_addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// } - -// numRetriesReceived--; -// assert(numRetriesReceived == 0); -// assert(!nextRecvPushRetryEvent.scheduled()); -// } - std::tuple CoalesceEngine::getOptimalBitVectorSlice() { - bool hit_in_cache; + bool hit_in_cache = false; int slice_base = -1; - int score = 0; - int max_score_possible = 3 * numElementsPerLine; + // int score = 0; + // int max_score_possible = 3 * numElementsPerLine; for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - int current_score = 0; + // int current_score = 0; uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; @@ -924,30 +833,32 @@ CoalesceEngine::getOptimalBitVectorSlice() if (current_popcount == 0) { continue; } - current_score += current_popcount; + // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].busyMask == 0)) { - current_score += numElementsPerLine * 2; - if (current_score > score) { - score = current_score; - slice_base = 
it; - hit_in_cache = true; - if (score == max_score_possible) { - break; - } - } + // current_score += numElementsPerLine * 2; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = true; + // if (score == max_score_possible) { + // break; + // } + // } + return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].allocated))) { - score += numElementsPerLine; - if (current_score > score) { - score = current_score; - slice_base = it; - hit_in_cache = false; - assert(score < max_score_possible); - } + // score += numElementsPerLine; + // if (current_score > score) { + // score = current_score; + // slice_base = it; + // hit_in_cache = false; + // assert(score < max_score_possible); + // } + return std::make_tuple(false, it); } } @@ -955,11 +866,11 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent() +CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base)= getOptimalBitVectorSlice(); + std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); if (slice_base != -1) { Addr addr = getBlockAddrFromBitIndex(slice_base); @@ -969,12 +880,12 @@ CoalesceEngine::processNextRecvPushRetryEvent() assert(cacheBlocks[block_index].busyMask == 0); // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - "its MemRetry.\n", __func__); - recvMemRetry(); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " + // "its MemRetry.\n", __func__); + // recvMemRetry(); + // nextRecvPushRetryEvent.setPrevState(0); + // } int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", @@ -1005,33 +916,38 @@ 
CoalesceEngine::processNextRecvPushRetryEvent() deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - writeBackQueue.push_back(block_index); - assert(writeBackQueue.size() <= numLines); - if ((!nextWriteBackEvent.pending()) && - (!nextWriteBackEvent.scheduled())) { - schedule(nextWriteBackEvent, nextCycle()); - } + // writeBackQueue.push_back(block_index); + // assert(writeBackQueue.size() <= numLines); + // if ((!nextWriteBackEvent.pending()) && + // (!nextWriteBackEvent.scheduled())) { + // schedule(nextWriteBackEvent, nextCycle()); + // } + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } } else { - if (memPort.blocked()) { - assert(nextRecvPushRetryEvent.getPrevState() != -1); - nextRecvPushRetryEvent.setPrevState(-1); - nextRecvPushRetryEvent.sleep(); - pendingEventQueue.push_back("nextRecvPushRetryEvent"); - assert(pendingEventQueue.size() <= 3); - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - "and has been pushed to pendingEventQueue." - " pendingEventQueue.size = %d.\n", - __func__, pendingEventQueue.size()); - return; - } + // if (memPort.blocked()) { + // // assert(nextRecvPushRetryEvent.getPrevState() != -1); + // nextRecvPushRetryEvent.setPrevState(-1); + // nextRecvPushRetryEvent.sleep(); + // pendingEventQueue.push_back("nextRecvPushRetryEvent"); + // assert(pendingEventQueue.size() <= 3); + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " + // "and has been pushed to pendingEventQueue." + // " pendingEventQueue.size = %d.\n", + // __func__, pendingEventQueue.size()); + // return; + // } // if nextRecvPushRetryEvent has been blocked by memory before - if (nextRecvPushRetryEvent.getPrevState() == -1) { - DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - nextRecvPushRetryEvent.setPrevState(0); - } + // if (nextRecvPushRetryEvent.getPrevState() == -1) { + // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " + // "unblocked by memPort. Setting prevState to 0.\n", __func__); + // nextRecvPushRetryEvent.setPrevState(0); + // } PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -1045,8 +961,15 @@ CoalesceEngine::processNextRecvPushRetryEvent() numRetriesReceived--; assert(numRetriesReceived == 0); } + // if (numRetriesReceived > 0) { + // schedule(nextRecvPushRetryEvent, nextCycle()); + // } if (numRetriesReceived > 0) { - schedule(nextRecvPushRetryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f6ed4843fa..4036dc49af 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -92,26 +92,30 @@ class CoalesceEngine : public BaseMemoryEngine int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque fillQueue; + // std::deque fillQueue; std::deque> responseQueue; - int currentBitSliceIndex; int numRetriesReceived; InOutSet applyQueue; std::bitset needsPush; - InOutSet writeBackQueue; + // InOutSet writeBackQueue; + int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - std::deque pendingEventQueue; + // std::deque pendingEventQueue; + + std::deque, int>> memoryFunctionQueue; + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); - MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(); + // MemoryEvent nextMemoryReadEvent; + void 
processNextMemoryReadEvent(int block_index); EventFunctionWrapper nextRespondEvent; void processNextRespondEvent(); @@ -119,11 +123,11 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(); + // MemoryEvent nextWriteBackEvent; + void processNextWriteBackEvent(int block_index); - MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(); + // MemoryEvent nextRecvPushRetryEvent; + void processNextRecvPushRetryEvent(int slice_base); struct CoalesceStats : public statistics::Group { From d00c61008d8ee2157b711441cd71a34ab32bb108 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 10:59:33 -0700 Subject: [PATCH 136/287] Adding more dprintfs. --- src/accl/graph/base/data_structs.hh | 36 +- src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 676 ++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 36 +- 4 files changed, 275 insertions(+), 481 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f178d5a7e2..707b57c56f 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,7 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include -#include -#include +#include namespace gem5 { @@ -90,49 +88,51 @@ static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); template -class InOutSet +class UniqueFIFO { private: - std::unordered_set set; + std::list fifo; public: - InOutSet(int cap) - { - set.reserve(cap); - } + UniqueFIFO() {} void push_back(T item) { - if (set.find(item) == set.end()) { - set.insert(item); + if (!find(item)) { + fifo.push_back(item); } } void pop_front() { - assert(set.begin() != set.end()); - set.erase(set.begin()); + assert(!fifo.empty()); + fifo.pop_front(); } T front() { - return *(set.begin()); + return 
fifo.front(); } size_t size() { - return set.size(); + return fifo.size(); } bool empty() { - return (size() == 0); + return fifo.empty(); } bool find(T item) { - return (set.find(item) != set.end()); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + return (it != fifo.end()); } void erase(T item) { - set.erase(item); + // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); + auto it = std::find(fifo.begin(), fifo.end(), item); + assert(it != fifo.end()); + fifo.erase(it); } }; diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index c60d189e0f..a5d1d7e8e7 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -73,15 +73,15 @@ void BaseMemoryEngine::MemPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); + DPRINTF(BaseMemoryEngine, "%s: Sending pakcet: %s to " + "the memory.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; - DPRINTF(BaseMemoryEngine, "%s: MemPort blocked. 
blockedPacket %s.\n", - __func__, blockedPacket->print()); + DPRINTF(BaseMemoryEngine, "%s: MemPort blocked.\n", __func__); } else { - DPRINTF(BaseMemoryEngine, "%s: Packet %s sent successfully.\n", - __func__, pkt->print()); + DPRINTF(BaseMemoryEngine, "%s: Packet sent successfully.\n", __func__); owner->recvMemRetry(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4d7107274b..6ed94fe938 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,14 +46,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), applyQueue(numLines), - // writeBackQueue(numLines), - nextMemoryEvent([this] { processNextMemoryEvent(); }, name()), - // nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextRespondEvent([this] { processNextRespondEvent(); }, name()), - nextApplyEvent([this] { processNextApplyEvent(); }, name()), - // nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()), - // nextRecvPushRetryEvent([this] { processNextRecvPushRetryEvent(); }, name()), + numRetriesReceived(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -79,8 +81,6 @@ CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - DPRINTF(CoalesceEngine, "%s: Trimming addr: %lu to %lu.\n", - __func__, addr, trimmed_addr); return ((int) (trimmed_addr / 
peerMemoryAtomSize)) % numLines; } @@ -108,21 +108,25 @@ bool CoalesceEngine::recvWLRead(Addr addr) { assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Received a read request for address: %lu.\n", - __func__, addr); - Addr aligned_addr = (addr / peerMemoryAtomSize) * peerMemoryAtomSize; + + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); // Hit // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextRespondEvent for latency cycles in + // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); @@ -138,12 +142,12 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].items[wl_offset].to_string(), responseQueue.size(), peerWLEngine->getRegisterFileSize()); - // TODO: Add a stat to count the number of WLItems that have been touched. + // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); stats.readHits++; - if (!nextRespondEvent.scheduled()) { - schedule(nextRespondEvent, nextCycle()); + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; @@ -151,44 +155,50 @@ CoalesceEngine::recvWLRead(Addr addr) // miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu not " - "found in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " - "Rejecting request.\n", __func__); + "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection stats.readRejections++; return false; } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries available.\n", __func__); + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); if (cacheBlocks[block_index].allocated) { assert(MSHR[block_index].size() <= numTgtsPerMSHR); DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); stats.readRejections++; return false; } cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && (cacheBlocks[block_index].valid)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. " - "applyQueue.size = %u.\n", __func__, - block_index, applyQueue.size()); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " + "busy. It %s in the applyQueue.\n", + __func__, block_index, + applyQueue.find(block_index) ? "is" : "is not"); + if (!applyQueue.find(block_index)) { + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added %d to " + "applyQueue. 
applyQueue.size = %u.\n", + __func__, block_index, applyQueue.size()); + } assert(!applyQueue.empty()); if ((!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); @@ -208,24 +218,18 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; cacheBlocks[block_index].hasConflict = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); - - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // FIXME: Fix this DPRINTF - // DPRINTF(CoalesceEngine, "%s: Pushed pkt index " - // "lineFillBuffer. 
lineFillBuffer.size = %d.\n", - // __func__, fillQueue.size()); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -236,21 +240,23 @@ CoalesceEngine::recvWLRead(Addr addr) } } } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr: %lu already " - "in MSHRs.\n", __func__, block_index, addr); + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for cacheBlocks[%d]. " - "Rejecting request.\n", - __func__, block_index); + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); stats.readRejections++; return false; } - if ((!cacheBlocks[block_index].hasConflict) && - (aligned_addr != cacheBlocks[block_index].addr)) { + if ((aligned_addr != cacheBlocks[block_index].addr)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); cacheBlocks[block_index].hasConflict = true; + } else { + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); } if (aligned_addr != cacheBlocks[block_index].addr) { @@ -260,295 +266,88 @@ CoalesceEngine::recvWLRead(Addr addr) } MSHR[block_index].push_back(addr); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for cache " - "line[%d].\n", __func__, addr, block_index); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; return true; } } } -void -CoalesceEngine::processNextMemoryReadEvent(int block_index) -{ - // assert(!nextMemoryReadEvent.pending()); - // if (memPort.blocked()) { - // // TODO: Implement interface where events of the CoalesceEngine are - // // pushed to a fifo to be scheduled later. - // nextMemoryReadEvent.sleep(); - // pendingEventQueue.push_back("nextMemoryReadEvent"); - // // Maximum three MemoryEvents. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextMemoryReadEvent is asleep now and " - // "has been pushed to pendingEventQueue. " - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = fillQueue.front(); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - - memPort.sendPacket(pkt); - - // fillQueue.pop_front(); - - // if (!fillQueue.empty()) { - // memoryFunctionQueue.push_back([this] { processNextMemoryReadEvent(); }); - // } -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextRespondEvent() -{ - Addr addr_response; - WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with Addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); - - if ((!nextRespondEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - nextMemoryEvent.sleep(); - return; - } - - std::function next_memory_function; - int next_memory_function_input; - std::tie(next_memory_function, next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); - memoryFunctionQueue.pop_front(); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - // if (pendingEventQueue.empty()) { - // DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - // return; - // } - - // std::string front = pendingEventQueue.front(); - // DPRINTF(CoalesceEngine, "%s: %s is pending MemRetry.\n", __func__, front); - - // if (front == "nextMemoryReadEvent") { - // assert(!nextMemoryReadEvent.scheduled()); - // assert(nextMemoryReadEvent.pending()); - // schedule(nextMemoryReadEvent, nextCycle()); - // nextMemoryReadEvent.wake(); - // } else if (front == "nextWriteBackEvent") { - // assert(!nextWriteBackEvent.scheduled()); - // assert(nextWriteBackEvent.pending()); - // schedule(nextWriteBackEvent, nextCycle()); - // nextWriteBackEvent.wake(); - // } else if (front == "nextRecvPushRetryEvent") { - // assert(!nextRecvPushRetryEvent.scheduled()); - // assert(nextRecvPushRetryEvent.pending()); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // nextRecvPushRetryEvent.wake(); - // } else { - // panic("EVENT IS NOT RECOGNIZED.\n"); - // } - - // pendingEventQueue.pop_front(); - // return; - - if (!nextMemoryEvent.pending()) { - DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -// FIXME: Fix this function. 
bool CoalesceEngine::handleMemResp(PacketPtr pkt) { assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - DPRINTF(CoalesceEngine, "%s: Received a write response for Addr: %lu. Dropping " - "the packet.\n", __func__, pkt->getAddr()); return true; } + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + if (pkt->findNextSenderState()) { - Addr addr = pkt->getAddr(); + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. int it = getBitIndexBase(addr); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - // We read the address to send the wl but it is put in cache before - // the read response arrives. - if (cacheBlocks[block_index].busyMask == 0) { - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as idle.\n", - __func__, addr); - int push_needed = 0; - // It is not busy anymore, we have to send the wl from cache. 
- DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (cacheBlocks[block_index].items[i].degree == 0))); - // TODO: Make this more programmable - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; - } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // Since we have just applied the line, we can take it out of - // the applyQueue if it's in there. No need to do the same - // thing for evictQueue. - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - } - } else { - // The line is busy. 
Therefore, we have to disregard the data - // we received from the memory and also tell the push engine to - // deallocate the space it allocated for this retry. However, - // we still have to rememeber that these items need a retry. - // i.e. don't change needsPush, call recvWLItemRetry with - // do_push = false - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was found in the cache as busy.\n", - __func__, addr); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } - } else { - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); - WorkListItem* items = pkt->getPtr(); - int push_needed = 0; - // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); - if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); - } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; + DPRINTF(CoalesceEngine, "%s: Received read response for retry " + "for addr %lu. It was not found in the cache.\n", + __func__, addr); + WorkListItem* items = pkt->getPtr(); + int push_needed = 0; + // No applying of the line needed. 
+ DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + for (int i = 0; i < numElementsPerLine; i++) { + assert(!((needsPush[it + i] == 1) && + (items[i].degree == 0))); + if (needsPush[it + i] == 1) { + peerPushEngine->recvWLItemRetry(items[i]); } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); + push_needed += needsPush[it + i]; + needsPush[it + i] = 0; } - + DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + __func__, needsPush.count()); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); + assert(peerPushEngine->getNumRetries() == needsPush.count()); + // } delete pkt; return true; } - Addr addr = pkt->getAddr(); - // int block_index = (addr / peerMemoryAtomSize) % numLines; - int block_index = getBlockIndex(addr); - - DPRINTF(CoalesceEngine, "%s: Received a read resposne for Addr: %lu.\n", - __func__, pkt->getAddr()); - // assert((cacheBlocks[block_index].allocated) && // allocated cache block - // (!cacheBlocks[block_index].valid) && // valid is false - // (!(MSHR.find(block_index) == MSHR.end()))); // allocated MSHR - assert(cacheBlocks[block_index].allocated); - assert(!cacheBlocks[block_index].valid); - assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].allocated); + assert(!cacheBlocks[block_index].valid); + assert(MSHR.find(block_index) != MSHR.end()); + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", __func__, - block_index, i, 
cacheBlocks[block_index].items[i].to_string()); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); + } + cacheBlocks[block_index].valid = true; + delete pkt; } - cacheBlocks[block_index].valid = true; - delete pkt; // FIXME: Get rid of servicedIndices (maybe use an iterator) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for cacheBlocks[%d] could " - "be serviced with the received packet.\n", - __func__, miss_addr, block_index); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); @@ -567,10 +366,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // End of the said block - servicedIndices.push_back(i); - DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - "removal.\n", __func__, i, block_index); + // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " + // "removal.\n", __func__, i, block_index); } } @@ -593,19 +391,46 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(cacheBlocks[block_index].hasConflict); } - if ((!nextRespondEvent.scheduled()) && + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { - schedule(nextRespondEvent, nextCycle()); + schedule(nextResponseEvent, nextCycle()); } return true; } +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + Addr addr_response; + WorkListItem worklist_response; + + std::tie(addr_response, worklist_response) = responseQueue.front(); + peerWLEngine->handleIncomingWL(addr_response, worklist_response); + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, + responseQueue.size(), peerWLEngine->getRegisterFileSize()); + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { // TODO: Parameterize all the numbers here. 
- Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); @@ -691,12 +516,11 @@ CoalesceEngine::processNextApplyEvent() // TODO: This is where eviction policy goes if ((cacheBlocks[block_index].hasConflict) && (cacheBlocks[block_index].busyMask == 0)) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // DPRINTF(CoalesceEngine, "%s: Added %d to writeBackQueue. " - // "writeBackQueue.size = %u.\n", __func__, - // block_index, writeBackQueue.size()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " + "to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); @@ -711,23 +535,47 @@ CoalesceEngine::processNextApplyEvent() } void -CoalesceEngine::processNextWriteBackEvent(int block_index) +CoalesceEngine::processNextMemoryEvent() { - // assert(!nextWriteBackEvent.pending()); - // if (memPort.blocked()) { - // nextWriteBackEvent.sleep(); - // pendingEventQueue.push_back("nextWriteBackEvent"); - // // Maximum three MemoryEvent. - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextWriteBackEvent is asleep now and " - // "has been pushed to pendingEventQueue. 
" - // "pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - - // int block_index = writeBackQueue.front(); + if (memPort.blocked()) { + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + std::tie( + next_memory_function, + next_memory_function_input) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input); + memoryFunctionQueue.pop_front(); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index) +{ + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + + memPort.sendPacket(pkt); +} + +void +CoalesceEngine::processNextWriteBack(int block_index) +{ // Why would we write it back if it does not have a conflict? assert(cacheBlocks[block_index].hasConflict); @@ -749,6 +597,10 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "Addr: %lu, size = %d.\n", __func__, write_pkt->getAddr(), write_pkt->getSize()); memPort.sendPacket(write_pkt); + } else { + DPRINTF(CoalesceEngine, "%s: No change observed on " + "cacheBlocks[%d]. 
No write back needed.\n", + __func__, block_index); } assert(!MSHR[block_index].empty()); Addr miss_addr = MSHR[block_index].front(); @@ -756,7 +608,7 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) "cacheBlocks[%d] is Addr: %lu.\n", __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); cacheBlocks[block_index].addr = aligned_miss_addr; cacheBlocks[block_index].busyMask = 0; @@ -766,53 +618,12 @@ CoalesceEngine::processNextWriteBackEvent(int block_index) cacheBlocks[block_index].dirty = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - // fillQueue.push_back(block_index); - // assert(fillQueue.size() <= numLines); - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextMemoryReadEvent(block_index); }, block_index); - // if ((!nextMemoryReadEvent.pending()) && - // (!nextMemoryReadEvent.scheduled())){ - // schedule(nextMemoryReadEvent, nextCycle()); - // } - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - } - - // writeBackQueue.pop_front(); - // assert(writeBackQueue.size() <= numLines); - // DPRINTF(CoalesceEngine, "%s: Popped %d from writeBackQueue. 
" - // "writeBackQueue.size = %d, writeBackQueueSize = %d.\n", - // __func__, block_index, writeBackQueue.size(), numLines); - - // if (!writeBackQueue.empty()) { - // assert(!nextWriteBackEvent.pending()); - // assert(!nextWriteBackEvent.scheduled()); - // schedule(nextWriteBackEvent, nextCycle()); - // memoryFunctionQueue.push_back([this] { processNextWriteBackEvent(); }); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // } -} -void -CoalesceEngine::recvPushRetry() -{ - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - // assert(!nextRecvPushRetryEvent.pending()); - // assert(!nextRecvPushRetryEvent.scheduled()); - assert(numRetriesReceived == 1); - // schedule(nextRecvPushRetryEvent, nextCycle()); - // TODO: Pass slice_base to getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " + "memoryFunctionQueue.\n", __func__, block_index); } } @@ -866,7 +677,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) +CoalesceEngine::processNextPushRetry(int slice_base_2) { bool hit_in_cache; int slice_base; @@ -879,14 +690,6 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - // if nextRecvPushRetryEvent has been blocked by memory before - // if 
(nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetry passing " - // "its MemRetry.\n", __func__); - // recvMemRetry(); - // nextRecvPushRetryEvent.setPrevState(0); - // } - int push_needed = 0; DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); @@ -916,39 +719,15 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) deschedule(nextApplyEvent); } if (cacheBlocks[block_index].hasConflict) { - // writeBackQueue.push_back(block_index); - // assert(writeBackQueue.size() <= numLines); - // if ((!nextWriteBackEvent.pending()) && - // (!nextWriteBackEvent.scheduled())) { - // schedule(nextWriteBackEvent, nextCycle()); - // } - memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBackEvent(block_index); }, block_index); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" + " input %d to memoryFunctionQueue.\n", + __func__, block_index); } } } else { - // if (memPort.blocked()) { - // // assert(nextRecvPushRetryEvent.getPrevState() != -1); - // nextRecvPushRetryEvent.setPrevState(-1); - // nextRecvPushRetryEvent.sleep(); - // pendingEventQueue.push_back("nextRecvPushRetryEvent"); - // assert(pendingEventQueue.size() <= 3); - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is asleep now " - // "and has been pushed to pendingEventQueue." - // " pendingEventQueue.size = %d.\n", - // __func__, pendingEventQueue.size()); - // return; - // } - // if nextRecvPushRetryEvent has been blocked by memory before - // if (nextRecvPushRetryEvent.getPrevState() == -1) { - // DPRINTF(CoalesceEngine, "%s: nextRecvPushRetryEvent is " - // "unblocked by memPort. 
Setting prevState to 0.\n", __func__); - // nextRecvPushRetryEvent.setPrevState(0); - // } - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); @@ -961,18 +740,53 @@ CoalesceEngine::processNextRecvPushRetryEvent(int slice_base_2) numRetriesReceived--; assert(numRetriesReceived == 0); } - // if (numRetriesReceived > 0) { - // schedule(nextRecvPushRetryEvent, nextCycle()); - // } + if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { processNextRecvPushRetryEvent(slice_base); }, 0); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvPushRetry() +{ + numRetriesReceived++; + DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); + // For now since we do only one retry at a time, we should not receive + // a retry while this nextSendingRetryEvent is scheduled or is pending. 
+ assert(numRetriesReceived == 1); + + // TODO: Pass slice_base to getOptimalBitVectorSlice + memoryFunctionQueue.emplace_back([this] (int slice_base) { + processNextPushRetry(slice_base); + }, 0); + DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " + "memoryFunctionQueue.\n", __func__); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } } + + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4036dc49af..7db09cec11 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -77,58 +77,40 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; - // int nmpu; - // Addr memoryAddressOffset; - WLEngine* peerWLEngine; PushEngine* peerPushEngine; - Block* cacheBlocks; - int numLines; int numElementsPerLine; + Block* cacheBlocks; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - - // std::deque fillQueue; - std::deque> responseQueue; int numRetriesReceived; - InOutSet applyQueue; + UniqueFIFO applyQueue; std::bitset needsPush; - // InOutSet writeBackQueue; - - int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); - // std::deque pendingEventQueue; - - std::deque, int>> memoryFunctionQueue; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); + void processNextRead(int block_index); + void processNextWriteBack(int block_index); + void processNextPushRetry(int slice_base); + std::deque, int>> memoryFunctionQueue; - // MemoryEvent nextMemoryReadEvent; - void processNextMemoryReadEvent(int block_index); - - EventFunctionWrapper nextRespondEvent; - void processNextRespondEvent(); + EventFunctionWrapper 
nextResponseEvent; + void processNextResponseEvent(); EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); - // MemoryEvent nextWriteBackEvent; - void processNextWriteBackEvent(int block_index); - - // MemoryEvent nextRecvPushRetryEvent; - void processNextRecvPushRetryEvent(int slice_base); - struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -164,8 +146,6 @@ class CoalesceEngine : public BaseMemoryEngine void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); - - // virtual void startup() override; }; } From 08ca0a193d0d22ef85cf5a95691a0317ff14c276 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 29 Jul 2022 16:59:30 -0700 Subject: [PATCH 137/287] Fixing cache block state machine. wip. --- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/coalesce_engine.cc | 385 ++++++++++++++++++++++--- src/accl/graph/sega/coalesce_engine.hh | 31 +- src/accl/graph/sega/state_machine.md | 1 + 4 files changed, 368 insertions(+), 50 deletions(-) create mode 100644 src/accl/graph/sega/state_machine.md diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 97a62d44a0..81a29df6af 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,5 +43,6 @@ DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') DebugFlag('CenteralController') DebugFlag('CoalesceEngine') +DebugFlag('CacheBlockState') DebugFlag('PushEngine') DebugFlag('WLEngine') diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6ed94fe938..a0c85de2f5 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/CoalesceEngine.hh" +#include "debug/CacheBlockState.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -104,11 +105,180 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return 
peerMemoryRange.addIntlvBits(trimmed_addr); } +// TODO: Prev implementaton of recvWLRead. Remove +// bool +// CoalesceEngine::recvWLRead(Addr addr) +// { +// assert(MSHR.size() <= numMSHREntries); + +// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); +// assert(aligned_addr % peerMemoryAtomSize == 0); +// int block_index = getBlockIndex(aligned_addr); +// assert(block_index < numLines); +// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); +// assert(wl_offset < numElementsPerLine); +// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " +// "This request maps to cacheBlocks[%d], aligned_addr: " +// "%lu, and wl_offset: %d.\n", __func__, addr, +// block_index, aligned_addr, wl_offset); + +// if ((cacheBlocks[block_index].addr == aligned_addr) && +// (cacheBlocks[block_index].valid)) { +// assert(cacheBlocks[block_index].allocated); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); +// // Hit +// // TODO: Add a hit latency as a param for this object. +// // Can't just schedule the nextResponseEvent for latency cycles in +// // the future. +// responseQueue.push_back(std::make_tuple(addr, +// cacheBlocks[block_index].items[wl_offset])); +// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " +// "to responseQueue. responseQueue.size = %d, " +// "responseQueueSize = %d.\n", __func__, addr, +// cacheBlocks[block_index].items[wl_offset].to_string(), +// responseQueue.size(), +// peerWLEngine->getRegisterFileSize()); +// // TODO: Stat to count the number of WLItems that have been touched. 
+// cacheBlocks[block_index].busyMask |= (1 << wl_offset); +// stats.readHits++; + +// if (!nextResponseEvent.scheduled()) { +// schedule(nextResponseEvent, nextCycle()); +// } +// stats.numVertexReads++; +// return true; +// } else { +// // miss +// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); +// if (MSHR.find(block_index) == MSHR.end()) { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" +// " %lu not found in MSHRs.\n", __func__, block_index, addr); +// assert(MSHR.size() <= numMSHREntries); +// if (MSHR.size() == numMSHREntries) { +// // Out of MSHR entries +// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " +// "Rejecting request.\n", __func__); +// // TODO: Break out read rejections into more than one stat +// // based on the cause of the rejection +// stats.readRejections++; +// return false; +// } else { +// DPRINTF(CoalesceEngine, "%s: MSHR " +// "entries available.\n", __func__); +// if (cacheBlocks[block_index].allocated) { +// assert(MSHR[block_index].size() <= numTgtsPerMSHR); +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// cacheBlocks[block_index].hasConflict = true; +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.readMisses++; +// stats.numVertexReads++; +// if ((cacheBlocks[block_index].busyMask == 0) && +// (cacheBlocks[block_index].valid)) { +// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " +// "busy. It %s in the applyQueue.\n", +// __func__, block_index, +// applyQueue.find(block_index) ? 
"is" : "is not"); +// if (!applyQueue.find(block_index)) { +// applyQueue.push_back(block_index); +// DPRINTF(CoalesceEngine, "%s: Added %d to " +// "applyQueue. applyQueue.size = %u.\n", +// __func__, block_index, applyQueue.size()); +// } +// assert(!applyQueue.empty()); +// if ((!nextApplyEvent.scheduled())) { +// schedule(nextApplyEvent, nextCycle()); +// } +// } +// return true; +// } else { +// assert(!cacheBlocks[block_index].valid); +// assert(MSHR[block_index].size() == 0); +// // MSHR available and no conflict +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " +// "Allocating a cache line for it.\n" +// , __func__, addr); + +// cacheBlocks[block_index].addr = aligned_addr; +// cacheBlocks[block_index].busyMask = 0; +// cacheBlocks[block_index].allocated = true; +// cacheBlocks[block_index].valid = false; +// cacheBlocks[block_index].hasConflict = false; +// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" +// " Addr: %lu.\n", __func__, block_index, addr); +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " +// "for cacheBlocks[%d].\n", __func__, addr, block_index); +// memoryFunctionQueue.emplace_back( +// [this] (int block_index) { +// processNextRead(block_index); +// }, block_index); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " +// "input %d to memoryFunctionQueue.\n", +// __func__, block_index); +// if ((!nextMemoryEvent.pending()) && +// (!nextMemoryEvent.scheduled())) { +// schedule(nextMemoryEvent, nextCycle()); +// } +// stats.readMisses++; +// stats.numVertexReads++; +// return true; +// } +// } +// } else { +// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " +// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); +// if (MSHR[block_index].size() == numTgtsPerMSHR) { +// DPRINTF(CoalesceEngine, "%s: Out of targets for " +// "cacheBlocks[%d]. 
Rejecting request.\n", +// __func__, block_index); +// stats.readRejections++; +// return false; +// } +// if ((aligned_addr != cacheBlocks[block_index].addr)) { +// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " +// "with Addr: %lu.\n", __func__, addr, +// cacheBlocks[block_index].addr); +// cacheBlocks[block_index].hasConflict = true; +// } else { +// DPRINTF(CoalesceEngine, "%s: There is room for another target " +// "for cacheBlocks[%d].\n", __func__, block_index); +// } + +// if (aligned_addr != cacheBlocks[block_index].addr) { +// stats.readMisses++; +// } else { +// stats.readHitUnderMisses++; +// } + +// MSHR[block_index].push_back(addr); +// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " +// "cacheBlocks[%d].\n", __func__, addr, block_index); +// stats.numVertexReads++; +// return true; +// } +// } +// } + bool CoalesceEngine::recvWLRead(Addr addr) { - assert(MSHR.size() <= numMSHREntries); - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = getBlockIndex(aligned_addr); @@ -119,11 +289,18 @@ CoalesceEngine::recvWLRead(Addr addr) "This request maps to cacheBlocks[%d], aligned_addr: " "%lu, and wl_offset: %d.\n", __func__, addr, block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].allocated); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); // Hit // TODO: Add a hit latency as a param for this object. 
// Can't just schedule the nextResponseEvent for latency cycles in @@ -144,20 +321,60 @@ CoalesceEngine::recvWLRead(Addr addr) peerWLEngine->getRegisterFileSize()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - stats.readHits++; + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if (!nextResponseEvent.scheduled()) { schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.readRejections++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + return true; } else { // miss + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" " %lu not found in MSHRs.\n", __func__, block_index, addr); - assert(MSHR.size() <= numMSHREntries); if (MSHR.size() == numMSHREntries) { // Out of MSHR entries DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " @@ -169,11 +386,12 @@ CoalesceEngine::recvWLRead(Addr addr) } else { DPRINTF(CoalesceEngine, "%s: MSHR " "entries available.\n", __func__); - if (cacheBlocks[block_index].allocated) { - assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", @@ -181,43 +399,116 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - cacheBlocks[block_index].hasConflict = true; + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + " to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + " not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; + // TODO: Add readConflicts here. 
+ stats.numVertexReads++; - if ((cacheBlocks[block_index].busyMask == 0) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " - "busy. It %s in the applyQueue.\n", - __func__, block_index, - applyQueue.find(block_index) ? "is" : "is not"); - if (!applyQueue.find(block_index)) { - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to " - "applyQueue. applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); - } - assert(!applyQueue.empty()); - if ((!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } return true; } else { - assert(!cacheBlocks[block_index].valid); - assert(MSHR[block_index].size() == 0); // MSHR available and no conflict DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " "Allocating a cache line for it.\n" , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); cacheBlocks[block_index].addr = aligned_addr; cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // cacheBlocks[block_index].allocated = true; + // cacheBlocks[block_index].hasConflict = false; DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); @@ -234,6 +525,9 @@ CoalesceEngine::recvWLRead(Addr addr) 
(!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); stats.readMisses++; stats.numVertexReads++; return true; @@ -241,7 +535,11 @@ CoalesceEngine::recvWLRead(Addr addr) } } else { DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); if (MSHR[block_index].size() == numTgtsPerMSHR) { DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", @@ -249,21 +547,12 @@ CoalesceEngine::recvWLRead(Addr addr) stats.readRejections++; return false; } - if ((aligned_addr != cacheBlocks[block_index].addr)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - cacheBlocks[block_index].hasConflict = true; - } else { - DPRINTF(CoalesceEngine, "%s: There is room for another target " + DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - } - if (aligned_addr != cacheBlocks[block_index].addr) { - stats.readMisses++; - } else { - stats.readHitUnderMisses++; - } + // cacheBlocks[block_index].hasConflict = true; + // TODO: Might want to differentiate between different misses. 
+ stats.readMisses++; MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " @@ -324,8 +613,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (cacheBlocks[block_index].addr == addr) { - assert(cacheBlocks[block_index].allocated); + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); @@ -335,6 +631,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].pendingData = false; delete pkt; } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7db09cec11..e7655a069e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -34,6 +34,7 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/push_engine.hh" +#include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -51,24 +52,42 @@ class CoalesceEngine : public BaseMemoryEngine { WorkListItem* items; Addr addr; - uint8_t busyMask; - bool allocated; + uint64_t busyMask; bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + + bool allocated; bool hasConflict; - bool dirty; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} Block(int num_elements): addr(0), busyMask(0), - allocated(false), 
valid(false), - hasConflict(false), - dirty(false) + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + allocated(false), + hasConflict(false) { items = new WorkListItem [num_elements]; } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s}", addr, busyMask, + valid ? "true" : "false", needsApply ? "true" : "false", + needsWB ? "true" : "false", pendingData ? "true" : "false", + pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + } }; struct SenderState : public Packet::SenderState diff --git a/src/accl/graph/sega/state_machine.md b/src/accl/graph/sega/state_machine.md new file mode 100644 index 0000000000..203c47cf02 --- /dev/null +++ b/src/accl/graph/sega/state_machine.md @@ -0,0 +1 @@ +# CoalesceEngine Block state machine \ No newline at end of file From 2b2b27ce86cd7c6d692af11e3f3f42b712c4d31b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 30 Jul 2022 23:14:08 -0700 Subject: [PATCH 138/287] Fixing cache block state machine. cont. wip --- src/accl/graph/sega/coalesce_engine.cc | 288 +++++++++---------------- 1 file changed, 98 insertions(+), 190 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a0c85de2f5..8f33a2d893 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -105,177 +105,6 @@ CoalesceEngine::getBlockAddrFromBitIndex(int index) return peerMemoryRange.addIntlvBits(trimmed_addr); } -// TODO: Prev implementaton of recvWLRead. 
Remove -// bool -// CoalesceEngine::recvWLRead(Addr addr) -// { -// assert(MSHR.size() <= numMSHREntries); - -// Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); -// assert(aligned_addr % peerMemoryAtomSize == 0); -// int block_index = getBlockIndex(aligned_addr); -// assert(block_index < numLines); -// int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); -// assert(wl_offset < numElementsPerLine); -// DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " -// "This request maps to cacheBlocks[%d], aligned_addr: " -// "%lu, and wl_offset: %d.\n", __func__, addr, -// block_index, aligned_addr, wl_offset); - -// if ((cacheBlocks[block_index].addr == aligned_addr) && -// (cacheBlocks[block_index].valid)) { -// assert(cacheBlocks[block_index].allocated); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); -// // Hit -// // TODO: Add a hit latency as a param for this object. -// // Can't just schedule the nextResponseEvent for latency cycles in -// // the future. -// responseQueue.push_back(std::make_tuple(addr, -// cacheBlocks[block_index].items[wl_offset])); -// DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " -// "to responseQueue. responseQueue.size = %d, " -// "responseQueueSize = %d.\n", __func__, addr, -// cacheBlocks[block_index].items[wl_offset].to_string(), -// responseQueue.size(), -// peerWLEngine->getRegisterFileSize()); -// // TODO: Stat to count the number of WLItems that have been touched. 
-// cacheBlocks[block_index].busyMask |= (1 << wl_offset); -// stats.readHits++; - -// if (!nextResponseEvent.scheduled()) { -// schedule(nextResponseEvent, nextCycle()); -// } -// stats.numVertexReads++; -// return true; -// } else { -// // miss -// DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); -// if (MSHR.find(block_index) == MSHR.end()) { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" -// " %lu not found in MSHRs.\n", __func__, block_index, addr); -// assert(MSHR.size() <= numMSHREntries); -// if (MSHR.size() == numMSHREntries) { -// // Out of MSHR entries -// DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. " -// "Rejecting request.\n", __func__); -// // TODO: Break out read rejections into more than one stat -// // based on the cause of the rejection -// stats.readRejections++; -// return false; -// } else { -// DPRINTF(CoalesceEngine, "%s: MSHR " -// "entries available.\n", __func__); -// if (cacheBlocks[block_index].allocated) { -// assert(MSHR[block_index].size() <= numTgtsPerMSHR); -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// cacheBlocks[block_index].hasConflict = true; -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.readMisses++; -// stats.numVertexReads++; -// if ((cacheBlocks[block_index].busyMask == 0) && -// (cacheBlocks[block_index].valid)) { -// DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not " -// "busy. It %s in the applyQueue.\n", -// __func__, block_index, -// applyQueue.find(block_index) ? 
"is" : "is not"); -// if (!applyQueue.find(block_index)) { -// applyQueue.push_back(block_index); -// DPRINTF(CoalesceEngine, "%s: Added %d to " -// "applyQueue. applyQueue.size = %u.\n", -// __func__, block_index, applyQueue.size()); -// } -// assert(!applyQueue.empty()); -// if ((!nextApplyEvent.scheduled())) { -// schedule(nextApplyEvent, nextCycle()); -// } -// } -// return true; -// } else { -// assert(!cacheBlocks[block_index].valid); -// assert(MSHR[block_index].size() == 0); -// // MSHR available and no conflict -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " -// "Allocating a cache line for it.\n" -// , __func__, addr); - -// cacheBlocks[block_index].addr = aligned_addr; -// cacheBlocks[block_index].busyMask = 0; -// cacheBlocks[block_index].allocated = true; -// cacheBlocks[block_index].valid = false; -// cacheBlocks[block_index].hasConflict = false; -// DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" -// " Addr: %lu.\n", __func__, block_index, addr); -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " -// "for cacheBlocks[%d].\n", __func__, addr, block_index); -// memoryFunctionQueue.emplace_back( -// [this] (int block_index) { -// processNextRead(block_index); -// }, block_index); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " -// "input %d to memoryFunctionQueue.\n", -// __func__, block_index); -// if ((!nextMemoryEvent.pending()) && -// (!nextMemoryEvent.scheduled())) { -// schedule(nextMemoryEvent, nextCycle()); -// } -// stats.readMisses++; -// stats.numVertexReads++; -// return true; -// } -// } -// } else { -// DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " -// "Addr: %lu already in MSHRs.\n", __func__, block_index, addr); -// if (MSHR[block_index].size() == numTgtsPerMSHR) { -// DPRINTF(CoalesceEngine, "%s: Out of targets for " -// "cacheBlocks[%d]. 
Rejecting request.\n", -// __func__, block_index); -// stats.readRejections++; -// return false; -// } -// if ((aligned_addr != cacheBlocks[block_index].addr)) { -// DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " -// "with Addr: %lu.\n", __func__, addr, -// cacheBlocks[block_index].addr); -// cacheBlocks[block_index].hasConflict = true; -// } else { -// DPRINTF(CoalesceEngine, "%s: There is room for another target " -// "for cacheBlocks[%d].\n", __func__, block_index); -// } - -// if (aligned_addr != cacheBlocks[block_index].addr) { -// stats.readMisses++; -// } else { -// stats.readHitUnderMisses++; -// } - -// MSHR[block_index].push_back(addr); -// DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " -// "cacheBlocks[%d].\n", __func__, addr, block_index); -// stats.numVertexReads++; -// return true; -// } -// } -// } - bool CoalesceEngine::recvWLRead(Addr addr) { @@ -615,6 +444,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -632,6 +463,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); delete pkt; } @@ -639,7 +472,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::vector servicedIndices; for (int i = 0; i < MSHR[block_index].size(); i++) { Addr miss_addr = MSHR[block_index][i]; - Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); if 
(aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -662,6 +496,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // End of the said block servicedIndices.push_back(i); // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " @@ -677,15 +513,13 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) MSHR[block_index].erase(MSHR[block_index].begin() + servicedIndices[i] - bias); bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced and is removed.\n", - __func__, print_addr); + DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " + "and is removed.\n", __func__, print_addr); } if (MSHR[block_index].empty()) { MSHR.erase(block_index); - cacheBlocks[block_index].hasConflict = false; - } else { - assert(cacheBlocks[block_index].hasConflict); + // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -726,37 +560,111 @@ CoalesceEngine::processNextResponseEvent() void CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) { - // TODO: Parameterize all the numbers here. Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - // int block_index = (aligned_addr / peerMemoryAtomSize) % numLines; int block_index = getBlockIndex(aligned_addr); int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s with Addr: %lu.\n", - __func__, wl.to_string(), addr); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. 
This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, wl.to_string(), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, wl.to_string(), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { - cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].items[wl_offset] = wl; + cacheBlocks[block_index].needsApply |= true; + // NOTE: We don't set needsWB and rely on processNextApplyEvent to + // set that bit. stats.numVertexWrites++; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); // TODO: Make this more general and programmable. if ((cacheBlocks[block_index].busyMask == 0)) { - DPRINTF(CoalesceEngine, "%s: Received all the expected writes for cacheBlocks[%d]." - " It does not have any taken items anymore.\n", - __func__, block_index); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added %d to applyQueue. 
applyQueue.size = %u.\n", - __func__, block_index, applyQueue.size()); + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextWriteBack(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + memoryFunctionQueue.emplace_back( + [this] (int block_index) { + processNextRead(block_index); + }, block_index); + 
DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); if ((!applyQueue.empty()) && (!nextApplyEvent.scheduled())) { From f138726a23ee6395f6c8f55a278677690cb57c83 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 31 Jul 2022 14:32:04 -0700 Subject: [PATCH 139/287] Completed cache block state machine. Needs rework of push interface. --- src/accl/graph/sega/coalesce_engine.cc | 205 +++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 7 +- 2 files changed, 109 insertions(+), 103 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8f33a2d893..904889f12b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -198,7 +198,11 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - assert(cacheBlocks[block_index].addr != aligned_addr); + // FIXME: Kake this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. 
+ // assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { @@ -220,14 +224,6 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " "with Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.readRejections++; - return false; - } if ((cacheBlocks[block_index].valid) && (cacheBlocks[block_index].busyMask == 0) && (!cacheBlocks[block_index].pendingApply) && @@ -288,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back( [this] (int block_index) { processNextRead(block_index); @@ -323,7 +319,7 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[blokc_index].pendingData); + assert(!cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR[block_index].size() == 0); @@ -607,6 +603,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); + } } else { assert(MSHR.size() <= numMSHREntries); // cache line has conflict. 
@@ -666,70 +666,71 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); - } - } void CoalesceEngine::processNextApplyEvent() { int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); - if (cacheBlocks[block_index].busyMask != 0) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "apply process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - stats.falseApplySchedules++; - } else if (!cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has no change. 
" - "Therefore, no apply needed.\n", __func__, block_index); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] could be applied.\n", - __func__, block_index); - for (int i = 0; i < numElementsPerLine; i++) { - uint32_t old_prop = cacheBlocks[block_index].items[i].prop; - uint32_t new_prop = std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - - if (new_prop != old_prop) { - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu]: %s.\n", __func__, - cacheBlocks[block_index].addr + (i * sizeof(WorkListItem)), - cacheBlocks[block_index].items[i].to_string()); - int bit_index = - getBitIndexBase(cacheBlocks[block_index].addr) + i; - if ((cacheBlocks[block_index].items[i].degree != 0) && - (needsPush[bit_index] == 0)) { - // If the respective bit in the bit vector is set - // there is no need to try and resend it. + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + uint32_t current_prop = cacheBlocks[block_index].items[index].prop; + uint32_t new_prop = std::min(current_prop, + cacheBlocks[block_index].items[index].tempProp); + if (new_prop != current_prop) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, cacheBlocks[block_index].addr, index, + cacheBlocks[block_index].items[index].to_string()); + + int bit_index_base = + getBitIndexBase(cacheBlocks[block_index].addr); + if ((needsPush[bit_index_base + index] == 0) && + (cacheBlocks[block_index].items[index].degree != 0)) { if (peerPushEngine->allocatePushSpace()) { peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[i]); + cacheBlocks[block_index].items[index]); } else { - needsPush[bit_index] = 1; + 
needsPush[bit_index_base + index] = 1; } } } } - } + cacheBlocks[block_index].needsWB = true; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; - // TODO: This is where eviction policy goes - if ((cacheBlocks[block_index].hasConflict) && - (cacheBlocks[block_index].busyMask == 0)) { - memoryFunctionQueue.emplace_back([this] (int block_index) { + assert(MSHR.size() < numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + memoryFunctionQueue.emplace_back([this] (int block_index) { processNextWriteBack(block_index); }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input %d " - "to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } applyQueue.pop_front(); @@ -770,6 +771,17 @@ CoalesceEngine::processNextMemoryEvent() void CoalesceEngine::processNextRead(int block_index) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + 
assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " @@ -781,54 +793,53 @@ CoalesceEngine::processNextRead(int block_index) void CoalesceEngine::processNextWriteBack(int block_index) { - // Why would we write it back if it does not have a conflict? - assert(cacheBlocks[block_index].hasConflict); - - if ((cacheBlocks[block_index].busyMask != 0) || - (applyQueue.find(block_index))) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been taken amid " - "writeback process. Therefore, ignoring the apply schedule.\n", - __func__, block_index); - // FIXME: Fix the name of this stat. - stats.falseEvictSchedules++; - } else { - if (cacheBlocks[block_index].dirty) { - DPRINTF(CoalesceEngine, "%s: Change observed on " - "cacheBlocks[%d].\n", __func__, block_index); - PacketPtr write_pkt = createWritePacket( + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + if (cacheBlocks[block_index].pendingWB) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " + DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, - write_pkt->getAddr(), write_pkt->getSize()); - memPort.sendPacket(write_pkt); - } else { - DPRINTF(CoalesceEngine, "%s: No change observed on " - "cacheBlocks[%d]. No write back needed.\n", - __func__, block_index); - } - assert(!MSHR[block_index].empty()); + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + Addr miss_addr = MSHR[block_index].front(); - DPRINTF(CoalesceEngine, "%s: First conflicting address for " - "cacheBlocks[%d] is Addr: %lu.\n", - __func__, block_index, miss_addr); Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].allocated = true; cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].hasConflict = true; - cacheBlocks[block_index].dirty = false; - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for " - "Addr: %lu.\n", __func__, block_index, aligned_miss_addr); - + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + 
cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input %d to " - "memoryFunctionQueue.\n", __func__, block_index); + processNextRead(block_index); + }, block_index); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); } } @@ -866,7 +877,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } return std::make_tuple(true, it); } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].allocated))) { + (cacheBlocks[block_index].pendingData))) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e7655a069e..2ba0b62aaf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,9 +59,6 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; - - bool allocated; - bool hasConflict; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -73,9 +70,7 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false), - allocated(false), - hasConflict(false) + pendingWB(false) { items = new WorkListItem [num_elements]; } From 4138a240b59a7d1da2370ff87d2848787a85ec09 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 2 Aug 2022 22:33:54 -0700 Subject: [PATCH 140/287] Fixing scheduling error of memory functions. 
--- src/accl/graph/SConscript | 32 ----- src/accl/graph/base/data_structs.hh | 2 +- src/accl/graph/sega/SConscript | 9 +- src/accl/graph/sega/coalesce_engine.cc | 176 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 24 ++-- 5 files changed, 120 insertions(+), 123 deletions(-) delete mode 100644 src/accl/graph/SConscript diff --git a/src/accl/graph/SConscript b/src/accl/graph/SConscript deleted file mode 100644 index 5dffd1a396..0000000000 --- a/src/accl/graph/SConscript +++ /dev/null @@ -1,32 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2016 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -Import('*') - -DebugFlag('SEGAStructureSize') -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 707b57c56f..830f1ecc16 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -47,7 +47,7 @@ struct __attribute__ ((packed)) WorkListItem std::string to_string() { return csprintf( - "WorkListItem{temp_prop: %u, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", tempProp, prop, degree, edgeIndex); } diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 81a29df6af..4c398b5ccd 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -39,10 +39,15 @@ Source('coalesce_engine.cc') Source('push_engine.cc') Source('wl_engine.cc') -DebugFlag('BaseMemoryEngine') DebugFlag('ApplyUpdates') +DebugFlag('BaseMemoryEngine') +DebugFlag('BitVector') DebugFlag('CenteralController') -DebugFlag('CoalesceEngine') DebugFlag('CacheBlockState') +DebugFlag('CoalesceEngine') DebugFlag('PushEngine') +DebugFlag('SEGAStructureSize') DebugFlag('WLEngine') + +CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', + 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 904889f12b..da2bc54c19 
100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,8 +33,9 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/CoalesceEngine.hh" +#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" @@ -76,6 +77,13 @@ CoalesceEngine::registerWLEngine(WLEngine* wl_engine) peerWLEngine = wl_engine; } +DrainState +CoalesceEngine::drain() +{ + DPRINTF(CoalesceEngine, "%s: drain called.\n"); + return DrainState::Drained; +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) @@ -156,6 +164,7 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -198,7 +207,7 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Kake this assert work. It will break if the cache block + // FIXME: Make this assert work. It will break if the cache block // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. 
@@ -258,10 +267,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" "to be written back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextWriteBack for input " "%d to memoryFunctionQueue.\n", @@ -274,7 +284,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does" + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " "not need to be written back.\n", __func__, block_index); cacheBlocks[block_index].addr = aligned_addr; @@ -285,10 +295,11 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed " "processNextRead for input " "%d to memoryFunctionQueue.\n", @@ -332,17 +343,16 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - // cacheBlocks[block_index].allocated = true; - // cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, 
block_index, addr); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " "input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -415,7 +425,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem* items = pkt->getPtr(); int push_needed = 0; // No applying of the line needed. - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { @@ -427,7 +437,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) push_needed += needsPush[it + i]; needsPush[it + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); peerPushEngine->deallocatePushSpace( numElementsPerLine - push_needed); @@ -459,6 +469,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].pendingData = false; + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); delete pkt; @@ -492,6 +503,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) peerWLEngine->getRegisterFileSize()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // End of the said block @@ -590,6 +602,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, cacheBlocks[block_index].items[wl_offset].to_string()); @@ -600,6 +613,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if ((cacheBlocks[block_index].busyMask == 0)) { if (cacheBlocks[block_index].needsApply) { cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); applyQueue.push_back(block_index); DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); @@ -617,10 +631,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" " back.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -645,10 +660,11 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( - [this] (int block_index) { - 
processNextRead(block_index); - }, block_index); + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " "for input %d to memoryFunctionQueue.\n", __func__, block_index); @@ -710,15 +726,18 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); assert(MSHR.size() < numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); cacheBlocks[block_index].pendingWB = true; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" " %d to memoryFunctionQueue.\n", __func__, block_index); if ((!nextMemoryEvent.pending()) && @@ -750,12 +769,14 @@ CoalesceEngine::processNextMemoryEvent() DPRINTF(CoalesceEngine, "%s: Processing another " "memory function.\n", __func__); - std::function next_memory_function; + std::function next_memory_function; int next_memory_function_input; + Tick next_memory_function_tick; std::tie( next_memory_function, - next_memory_function_input) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input); + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" "memoryFunctionQueue.size = %d.\n", __func__, @@ -769,12 +790,16 @@ CoalesceEngine::processNextMemoryEvent() } void -CoalesceEngine::processNextRead(int block_index) +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + // + assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].needsWB); @@ -791,23 +816,25 @@ CoalesceEngine::processNextRead(int block_index) } void -CoalesceEngine::processNextWriteBack(int block_index) +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - if (cacheBlocks[block_index].pendingWB) { + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -833,13 +860,21 @@ CoalesceEngine::processNextWriteBack(int block_index) cacheBlocks[block_index].pendingData = true; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextRead(block_index); - }, block_index); + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); } } @@ -863,9 +898,14 @@ CoalesceEngine::getOptimalBitVectorSlice() // current_score += current_popcount; Addr addr = getBlockAddrFromBitIndex(it); int block_index = getBlockIndex(addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].busyMask == 0)) { + // Idle state: valid && !pendingApply && !pendingWB + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); // current_score += numElementsPerLine * 2; // if (current_score > score) { // score = current_score; @@ -876,8 +916,7 @@ CoalesceEngine::getOptimalBitVectorSlice() // } // } return std::make_tuple(true, it); - } else if (!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].pendingData))) { + } else if (cacheBlocks[block_index].addr != addr) { // score += numElementsPerLine; // if (current_score > score) { // score = current_score; @@ -893,7 +932,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int slice_base_2) +CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -907,17 +946,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) assert(cacheBlocks[block_index].busyMask == 0); int push_needed = 0; - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - // TODO: Make this more programmable - uint32_t new_prop = 
std::min( - cacheBlocks[block_index].items[i].prop, - cacheBlocks[block_index].items[i].tempProp); - cacheBlocks[block_index].items[i].tempProp = new_prop; - cacheBlocks[block_index].items[i].prop = new_prop; if (needsPush[slice_base + i] == 1) { peerPushEngine->recvWLItemRetry( cacheBlocks[block_index].items[i]); @@ -925,24 +958,11 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) push_needed += needsPush[slice_base + i]; needsPush[slice_base + i] = 0; } - DPRINTF(CoalesceEngine, "%s: needsPush.count: %d.\n", + DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace(numElementsPerLine - push_needed); + peerPushEngine->deallocatePushSpace( + numElementsPerLine - push_needed); assert(peerPushEngine->getNumRetries() == needsPush.count()); - if (applyQueue.find(block_index)) { - applyQueue.erase(block_index); - if (applyQueue.empty() && nextApplyEvent.scheduled()) { - deschedule(nextApplyEvent); - } - if (cacheBlocks[block_index].hasConflict) { - memoryFunctionQueue.emplace_back([this] (int block_index) { - processNextWriteBack(block_index); - }, block_index); - DPRINTF(CoalesceEngine, "%s: Pushed nextWriteBackEvent for" - " input %d to memoryFunctionQueue.\n", - __func__, block_index); - } - } } else { PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -958,9 +978,10 @@ CoalesceEngine::processNextPushRetry(int slice_base_2) } if (numRetriesReceived > 0) { - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " "0 to memoryFunctionQueue.\n", __func__); } @@ -990,9 +1011,10 @@ CoalesceEngine::recvPushRetry() assert(numRetriesReceived == 1); // TODO: Pass slice_base to 
getOptimalBitVectorSlice - memoryFunctionQueue.emplace_back([this] (int slice_base) { - processNextPushRetry(slice_base); - }, 0); + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextPushRetry(slice_base, schedule_tick); + }, 0, curTick()); DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2ba0b62aaf..ce6e0daca6 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -59,6 +59,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingData; bool pendingApply; bool pendingWB; + Tick lastChangedTick; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -70,7 +71,8 @@ class CoalesceEngine : public BaseMemoryEngine needsWB(false), pendingData(false), pendingApply(false), - pendingWB(false) + pendingWB(false), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } @@ -78,10 +80,11 @@ class CoalesceEngine : public BaseMemoryEngine std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s}", addr, busyMask, - valid ? "true" : "false", needsApply ? "true" : "false", - needsWB ? "true" : "false", pendingData ? "true" : "false", - pendingApply ? "true" : "false", pendingWB ? "true" : "false"); + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); } }; @@ -114,10 +117,10 @@ class CoalesceEngine : public BaseMemoryEngine MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); - void processNextRead(int block_index); - void processNextWriteBack(int block_index); - void processNextPushRetry(int slice_base); - std::deque, int>> memoryFunctionQueue; + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextPushRetry(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); @@ -151,12 +154,11 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); + virtual DrainState drain() override; bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); void recvPushRetry(); From 1194dc3ec83a9b78acfa4487cbd2552eed74c317 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 3 Aug 2022 12:41:28 -0700 Subject: [PATCH 141/287] Fixing incorrect assert. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index da2bc54c19..21dd746aad 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -728,7 +728,7 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); - assert(MSHR.size() < numMSHREntries); + assert(MSHR.size() <= numMSHREntries); if (MSHR.find(block_index) != MSHR.end()) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " "conflicts.\n", __func__, block_index); From c1d92aed296ca6827fb75047216c32efbe477b98 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 5 Aug 2022 13:37:54 -0700 Subject: [PATCH 142/287] Updating memory address mapping and interface for push coalesce. --- configs/accl/sega.py | 30 ++++++++++------- src/accl/graph/base/base_reduce_engine.cc | 2 +- src/accl/graph/base/base_reduce_engine.hh | 3 +- src/accl/graph/base/data_structs.hh | 19 +++++++++++ src/accl/graph/sega/PushEngine.py | 3 +- src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++------- src/accl/graph/sega/push_engine.hh | 35 +++++++++++++++----- 7 files changed, 96 insertions(+), 36 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7577331f2b..26488ef69d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -8,20 +8,23 @@ class MPU(SubSystem): def __init__(self, base_edge_addr): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - push_req_queue_size=2, + self.push_engine = PushEngine(base_edge_addr=0, + push_req_queue_size=32, attached_memory_atom_size=64, - outstanding_mem_req_queue_size=1, - resp_queue_size=1) + resp_queue_size=64) + # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, + # push_req_queue_size=32, + # attached_memory_atom_size=64, + # 
resp_queue_size=64) self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="32B", - num_mshr_entry=1, - num_tgts_per_mshr=1) + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=1, - on_the_fly_update_map_size=1) + update_queue_size=64, + register_file_size=32) def getRespPort(self): return self.wl_engine.resp_port @@ -74,10 +77,15 @@ def __init__(self, latency="30ns") ) edge_mem_ctrl.append( - SimpleMemory(range=self._edge_ranges[i], + # SimpleMemory(range=self._edge_ranges[i], + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}") + SimpleMemory(range=AddrRange(self._edge_chunk_size), bandwidth="4.8GB/s", latency="30ns", - image_file=f"{graph_path}/edgelist_{i}") + image_file=f"{graph_path}/edgelist_{i}", + in_addr_map=False) ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/accl/graph/base/base_reduce_engine.cc b/src/accl/graph/base/base_reduce_engine.cc index 38a8662ed0..ade95800d2 100644 --- a/src/accl/graph/base/base_reduce_engine.cc +++ b/src/accl/graph/base/base_reduce_engine.cc @@ -31,7 +31,7 @@ namespace gem5 { -BaseReduceEngine::BaseReduceEngine(const BaseReduceEngineParams ¶ms): +BaseReduceEngine::BaseReduceEngine(const Params ¶ms): ClockedObject(params), system(params.system), _requestorId(system->getRequestorId(this)) diff --git a/src/accl/graph/base/base_reduce_engine.hh b/src/accl/graph/base/base_reduce_engine.hh index c8c9784ed1..268bb60b76 100644 --- a/src/accl/graph/base/base_reduce_engine.hh +++ b/src/accl/graph/base/base_reduce_engine.hh @@ -47,8 +47,7 @@ class BaseReduceEngine : public ClockedObject public: PARAMS(BaseReduceEngine); - - BaseReduceEngine(const BaseReduceEngineParams ¶ms); + BaseReduceEngine(const Params ¶ms); ~BaseReduceEngine(); RequestorID requestorId() { return _requestorId; } diff --git 
a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 830f1ecc16..6f775d8a38 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -78,15 +78,34 @@ struct __attribute__ ((packed)) Edge return csprintf("Edge{weight: %u, neighbor: %lu}", weight, neighbor); } + Edge(): weight(0), neighbor(0) {} + Edge(uint16_t weight, uint64_t neighbor): weight(weight), neighbor(neighbor) {} + }; static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); +struct CompleteEdge { + uint64_t src; + uint64_t dst; + uint32_t weight; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): + src(src), dst(dst), weight(weight) + {} + + std::string to_string() + { + return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + src, dst, weight); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 447731219e..a45f5d6ead 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -35,8 +35,7 @@ class PushEngine(BaseMemoryEngine): cxx_class = 'gem5::PushEngine' req_port = RequestPort("Port to send updates to the outside") - base_edge_addr = Param.Addr("The base address for the " - "attached edge memory") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d87462d7dd..d071e8fd37 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -38,7 +38,6 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - baseEdgeAddr(params.base_edge_addr), pushReqQueueSize(params.push_req_queue_size), numTotalRetries(0), numPendingRetries(0), onTheFlyMemReqs(0), @@ -140,12 +139,12 @@ PushEngine::recvWLItem(WorkListItem wl) "checking if there is 
enough push space. Use allocatePushSpace.\n"); DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -162,12 +161,12 @@ PushEngine::recvWLItemRetry(WorkListItem wl) DPRINTF(PushEngine, "%s: Received %s with retry.\n", __func__, wl.to_string()); - Addr start_addr = baseEdgeAddr + (wl.edgeIndex * sizeof(Edge)); + Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); uint32_t value = wl.prop; pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value); + peerMemoryAtomSize, value, 0); assert(pushReqQueue.size() <= pushReqQueueSize); DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", __func__, pushReqQueue.size()); @@ -191,22 +190,24 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - PushPacketInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = pushReqQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " - "PushPacketInfoGen. aligned_addr: %lu, offset: %lu, " + "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); reqOffsetMap[pkt->req] = offset; reqNumEdgeMap[pkt->req] = num_edges; reqValueMap[pkt->req] = curr_info.value(); + PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs++; if (curr_info.done()) { - DPRINTF(PushEngine, "%s: Current PushPacketInfoGen is done.\n", __func__); + DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); pushReqQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " "pushReqQueue.size() = %u.\n", @@ -228,9 +229,6 @@ PushEngine::processNextMemoryReadEvent() } } - // if ((!nextMemoryReadEvent.scheduled()) && (!pushReqQueue.empty())) { - // schedule(nextMemoryReadEvent, nextCycle()); - // } if (!pushReqQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); @@ -265,6 +263,20 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs--; assert(memRespQueue.size() <= memRespQueueSize); + uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + PushInfo push_info = reqInfoMap[pkt->req]; + pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); + + std::vector edges; + for (int i = 0; i < push_info.numElements; i++) { + Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); + Addr edge_dst = edge->neighbor; + uint32_t edge_weight = edge->weight; + edges.emplace_back(push_info.src, edge_dst, edge_weight); + } + edgeQueue.push_back(edges); + delete pkt_data; + if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { schedule(nextPushEvent, nextCycle()); } @@ -288,6 +300,12 @@ PushEngine::processNextPushEvent() Edge* curr_edge = (Edge*) (data + offset); + std::vector& current_edges = edgeQueue.front(); + while(!current_edges.empty()) { + CompleteEdge curr_edge = current_edges.back(); + 
DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); + current_edges.pop_back(); + } // TODO: Implement propagate function here uint32_t update_value = value + 1; PacketPtr update = createUpdatePacket( diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9b182e2251..7fb6c42579 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,19 +42,21 @@ class CoalesceEngine; class PushEngine : public BaseMemoryEngine { private: - class PushPacketInfoGen { + class EdgeReadInfoGen { private: Addr _start; Addr _end; size_t _step; size_t _atom; + uint32_t _value; + Addr _src; public: - PushPacketInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _value(value) + EdgeReadInfoGen(Addr start, Addr end, size_t step, + size_t atom, uint32_t value, Addr src): + _start(start), _end(end), _step(step), + _atom(atom), _value(value), _src(src) {} std::tuple nextReadPacketInfo() @@ -74,8 +76,17 @@ class PushEngine : public BaseMemoryEngine return std::make_tuple(aligned_addr, offset, num_items); } - uint32_t value() { return _value; } bool done() { return (_start >= _end); } + + Addr src() { return _src; } + uint32_t value() { return _value; } + }; + + struct PushInfo { + Addr src; + uint32_t value; + Addr offset; + int numElements; }; class ReqPort : public RequestPort @@ -98,26 +109,27 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; ReqPort reqPort; - Addr baseEdgeAddr; - int pushReqQueueSize; int numTotalRetries; int numPendingRetries; - std::deque pushReqQueue; + std::deque pushReqQueue; // TODO: Add size one size for all these maps std::unordered_map reqOffsetMap; std::unordered_map reqNumEdgeMap; std::unordered_map reqValueMap; + std::unordered_map reqInfoMap; int onTheFlyMemReqs; int memRespQueueSize; 
std::deque memRespQueue; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); @@ -167,6 +179,11 @@ class PushEngine : public BaseMemoryEngine int getNumRetries() { return numTotalRetries; } + void start(); // CoalesceEngine announcing work + void stop(); // CoalesceEngine announcing no work + bool running() { return _running; } + void recvWLItem2(Addr addr, WorkListItem wl); + }; } From 371f2b600c6b24ad2bdcb3f434284c06b22cff04 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 12 Aug 2022 08:32:42 -0700 Subject: [PATCH 143/287] Implemented pullVertex. --- configs/accl/sega.py | 7 +- src/accl/graph/base/data_structs.hh | 5 +- src/accl/graph/sega/SConscript | 1 + src/accl/graph/sega/base_memory_engine.cc | 8 +- src/accl/graph/sega/coalesce_engine.cc | 71 +++--- src/accl/graph/sega/coalesce_engine.hh | 6 +- src/accl/graph/sega/push_engine.cc | 257 +++++++++------------- src/accl/graph/sega/push_engine.hh | 52 ++--- 8 files changed, 167 insertions(+), 240 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 26488ef69d..e7a704d477 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -6,10 +6,9 @@ from m5.util.convert import toMemorySize class MPU(SubSystem): - def __init__(self, base_edge_addr): + def __init__(self): super(MPU, self).__init__() - self.push_engine = PushEngine(base_edge_addr=0, - push_req_queue_size=32, + self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, @@ -151,7 +150,7 @@ def __init__(self, mpus = [] for i in range(num_mpus): - mpus.append(MPU(base_edge_addr=self.mem_ctrl.getEdgeBaseAddr(i))) + mpus.append(MPU()) mpus[i].setReqPort(self.interconnect.cpu_side_ports) mpus[i].setRespPort(self.interconnect.mem_side_ports) mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
6f775d8a38..026a3cb7b2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -94,9 +94,10 @@ struct CompleteEdge { uint64_t src; uint64_t dst; uint32_t weight; + uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight): - src(src), dst(dst), weight(weight) + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 4c398b5ccd..ae216ccdd4 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -47,6 +47,7 @@ DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') +DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index a5d1d7e8e7..9bd1941b23 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -99,11 +99,9 @@ BaseMemoryEngine::MemPort::recvReqRetry() "Received retry without a blockedPacket"); _blocked = false; - sendPacket(blockedPacket); - - if (!blocked()) { - blockedPacket = nullptr; - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); } PacketPtr diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 21dd746aad..dcec2a5f78 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,8 +47,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - numRetriesReceived(0), + 
numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -423,26 +424,20 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu. It was not found in the cache.\n", __func__, addr); WorkListItem* items = pkt->getPtr(); - int push_needed = 0; // No applying of the line needed. DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { - assert(!((needsPush[it + i] == 1) && - (items[i].degree == 0))); + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[it + i] == 1) { - peerPushEngine->recvWLItemRetry(items[i]); + _workCount--; + needsPush[it + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, items[i]); + break; } - push_needed += needsPush[it + i]; - needsPush[it + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - // } delete pkt; return true; } @@ -691,7 +686,7 @@ CoalesceEngine::processNextApplyEvent() DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, cacheBlocks[block_index].to_string()); + __func__, block_index, cacheBlocks[block_index].to_string()); assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); @@ -712,14 +707,15 @@ CoalesceEngine::processNextApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if ((needsPush[bit_index_base + index] == 0) && - (cacheBlocks[block_index].items[index].degree != 0)) { - if (peerPushEngine->allocatePushSpace()) { - peerPushEngine->recvWLItem( - cacheBlocks[block_index].items[index]); - } else { + + if (cacheBlocks[block_index].items[index].degree > 0) { + if (needsPush[bit_index_base + index] == 0) { + _workCount++; needsPush[bit_index_base + index] = 1; } + if (!peerPushEngine->running()) { + peerPushEngine->start(); + } } } } @@ -945,24 +941,20 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - int push_needed = 0; DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - assert(peerPushEngine->getNumRetries() == needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); if (needsPush[slice_base + i] == 1) { - peerPushEngine->recvWLItemRetry( - cacheBlocks[block_index].items[i]); + _workCount--; + needsPush[slice_base + i] = 0; + peerPushEngine->recvVertexPush(vertex_addr, + cacheBlocks[block_index].items[i]); + break; } - push_needed += needsPush[slice_base + i]; - needsPush[slice_base + i] = 0; } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); - peerPushEngine->deallocatePushSpace( - numElementsPerLine - push_needed); - assert(peerPushEngine->getNumRetries() == needsPush.count()); } else { PacketPtr pkt 
= createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); @@ -973,11 +965,10 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) // a flag to true (maybe not even needed just look if the cache has a // line allocated for it in the cacheBlocks). } - numRetriesReceived--; - assert(numRetriesReceived == 0); + numPullsReceived--; } - if (numRetriesReceived > 0) { + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); @@ -1002,29 +993,19 @@ CoalesceEngine::recvMemRetry() } void -CoalesceEngine::recvPushRetry() +CoalesceEngine::recvVertexPull() { - numRetriesReceived++; - DPRINTF(CoalesceEngine, "%s: Received a push retry.\n", __func__); - // For now since we do only one retry at a time, we should not receive - // a retry while this nextSendingRetryEvent is scheduled or is pending. - assert(numRetriesReceived == 1); - - // TODO: Pass slice_base to getOptimalBitVectorSlice + numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { processNextPushRetry(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input 0 to " - "memoryFunctionQueue.\n", __func__); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } } - - CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index ce6e0daca6..6969fe2823 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,8 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map> MSHR; std::deque> responseQueue; - int numRetriesReceived; + int _workCount; + int numPullsReceived; UniqueFIFO 
applyQueue; std::bitset needsPush; @@ -161,7 +162,8 @@ class CoalesceEngine : public BaseMemoryEngine void recvWLWrite(Addr addr, WorkListItem wl); void registerWLEngine(WLEngine* wl_engine); - void recvPushRetry(); + int workCount() { return _workCount; } + void recvVertexPull(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d071e8fd37..b5341b3d61 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "debug/PushEngine.hh" +#include "debug/TempFlag.hh" #include "mem/packet_access.hh" namespace gem5 @@ -38,13 +39,12 @@ namespace gem5 PushEngine::PushEngine(const Params ¶ms): BaseMemoryEngine(params), reqPort(name() + ".req_port", this), - pushReqQueueSize(params.push_req_queue_size), - numTotalRetries(0), numPendingRetries(0), - onTheFlyMemReqs(0), - memRespQueueSize(params.resp_queue_size), + _running(false), + numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), + onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), - nextSendRetryEvent([this] { processNextSendRetryEvent(); }, name()), stats(*this) {} @@ -66,15 +66,31 @@ PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, numElementsPerLine = elements_per_line; } +void +PushEngine::recvReqRetry() +{ + DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); + if (nextPushEvent.pending()) { + nextPushEvent.wake(); + schedule(nextPushEvent, nextCycle()); + } +} + void PushEngine::ReqPort::sendPacket(PacketPtr pkt) { panic_if(_blocked, "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
+ DPRINTF(PushEngine, "%s: Sending pakcet: %s to " + "the network.\n", __func__, pkt->print()); if (!sendTimingReq(pkt)) { blockedPacket = pkt; _blocked = true; + DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); + } else { + DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); + owner->recvReqRetry(); } } @@ -92,86 +108,73 @@ PushEngine::ReqPort::recvReqRetry() DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - DPRINTF(PushEngine, "%s: Sent the blockedPacket. " - "_blocked: %s, (blockedPacket == nullptr): %s.\n", - __func__, _blocked ? "true" : "false", - (blockedPacket == nullptr) ? "true" : "false"); - } + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +PushEngine::vertexSpace() +{ + return (edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) < edgePointerQueueSize); +} + +bool +PushEngine::workLeft() +{ + return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } void -PushEngine::deallocatePushSpace(int space) +PushEngine::start() { - /// DISCUSS: Might have to check whether the addrGenEvent is scheduled - // and or the pushReqQueue is empty. If so we might need to - // send retries. 
- DPRINTF(PushEngine, "%s: Received reported %d free spaces.\n", - __func__, space); - numPendingRetries--; - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d " - "free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - assert(!nextSendRetryEvent.scheduled()); - schedule(nextSendRetryEvent, nextCycle()); - } + assert(!_running); + assert(!nextVertexPullEvent.scheduled()); + + _running = true; + // NOTE: We might have to check for size availability here. + assert(workLeft()); + if (vertexSpace()) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItem(WorkListItem wl) +PushEngine::processNextVertexPullEvent() { - assert(wl.degree != 0); - - assert((pushReqQueueSize == 0) || - (pushReqQueue.size() < pushReqQueueSize)); - panic_if((pushReqQueue.size() == pushReqQueueSize) && - (pushReqQueueSize != 0), "You should call this method after " - "checking if there is enough push space. 
Use allocatePushSpace.\n"); + // TODO: change edgePointerQueueSize + numPendingPulls++; + peerCoalesceEngine->recvVertexPull(); - DPRINTF(PushEngine, "%s: Received %s.\n", __func__, wl.to_string()); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + if (!workLeft()) { + _running = false; + } - if ((!nextMemoryReadEvent.pending()) && - (!nextMemoryReadEvent.scheduled())) { - schedule(nextMemoryReadEvent, nextCycle()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); } } void -PushEngine::recvWLItemRetry(WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, WorkListItem wl) { - assert(wl.degree != 0); - DPRINTF(PushEngine, "%s: Received %s with retry.\n", - __func__, wl.to_string()); + assert(wl.degree > 0); + assert((edgePointerQueueSize == 0) || + ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = wl.prop; - pushReqQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, value, 0); - assert(pushReqQueue.size() <= pushReqQueueSize); - DPRINTF(PushEngine, "%s: pushReqQueue.size() = %d.\n", - __func__, pushReqQueue.size()); + edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + numPendingPulls--; + DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", + __func__, addr, wl.to_string()); + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } - numTotalRetries--; if ((!nextMemoryReadEvent.pending()) && 
(!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); @@ -186,20 +189,17 @@ PushEngine::processNextMemoryReadEvent() return; } - if (memRespQueue.size() < (memRespQueueSize - onTheFlyMemReqs)) { + if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen &curr_info = pushReqQueue.front(); + EdgeReadInfoGen &curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - reqOffsetMap[pkt->req] = offset; - reqNumEdgeMap[pkt->req] = num_edges; - reqValueMap[pkt->req] = curr_info.value(); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; @@ -208,42 +208,23 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); - pushReqQueue.pop_front(); - DPRINTF(PushEngine, "%s: Popped curr_info from pushReqQueue. " - "pushReqQueue.size() = %u.\n", - __func__, pushReqQueue.size()); - if (numTotalRetries > 0) { - int free_space = pushReqQueueSize - - (pushReqQueue.size() + (numPendingRetries * numElementsPerLine)); - DPRINTF(PushEngine, "%s: pushReqQueue has at least %d" - " free spaces.\n", __func__, free_space); - if ((free_space >= numElementsPerLine) && - (numPendingRetries == 0)) { - DPRINTF(PushEngine, "%s: Sent a push retry to " - "peerCoalesceEngine.\n", __func__); - if (!nextSendRetryEvent.scheduled()) { - schedule(nextSendRetryEvent, nextCycle()); - } - } - } + edgePointerQueue.pop_front(); + DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" + "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } } - if (!pushReqQueue.empty()) { + if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + + if (!edgePointerQueue.empty()) { assert(!nextMemoryReadEvent.pending()); assert(!nextMemoryReadEvent.scheduled()); schedule(nextMemoryReadEvent, nextCycle()); } } -void -PushEngine::processNextSendRetryEvent() -{ - assert(numPendingRetries == 0); - numPendingRetries++; - peerCoalesceEngine->recvPushRetry(); -} - void PushEngine::recvMemRetry() { @@ -259,25 +240,27 @@ PushEngine::handleMemResp(PacketPtr pkt) { // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - memRespQueue.push_back(pkt); - onTheFlyMemReqs--; - assert(memRespQueue.size() <= memRespQueueSize); uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::vector edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, edge_weight); + edges.emplace_back(push_info.src, edge_dst, + edge_weight, push_info.value); } edgeQueue.push_back(edges); + onTheFlyMemReqs--; + reqInfoMap.erase(pkt->req); delete pkt_data; + delete pkt; - if ((!nextPushEvent.scheduled()) && (!memRespQueue.empty())) { + if ((!nextPushEvent.pending()) && + (!nextPushEvent.scheduled())) { schedule(nextPushEvent, nextCycle()); } return true; @@ -287,50 +270,37 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - PacketPtr pkt = memRespQueue.front(); - uint8_t* data = pkt->getPtr(); - - Addr offset = reqOffsetMap[pkt->req]; - assert(offset < peerMemoryAtomSize); - uint32_t value = reqValueMap[pkt->req]; + if 
(reqPort.blocked()) { + nextPushEvent.sleep(); + return; + } - DPRINTF(PushEngine, "%s: Looking at the front of the queue. pkt->Addr: %lu, " - "offset: %lu\n", - __func__, pkt->getAddr(), offset); + std::deque& edge_list = edgeQueue.front(); + CompleteEdge curr_edge = edge_list.front(); - Edge* curr_edge = (Edge*) (data + offset); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - std::vector& current_edges = edgeQueue.front(); - while(!current_edges.empty()) { - CompleteEdge curr_edge = current_edges.back(); - DPRINTF(PushEngine, "%s: %s.\n", __func__, curr_edge.to_string()); - current_edges.pop_back(); - } // TODO: Implement propagate function here - uint32_t update_value = value + 1; + uint32_t update_value = curr_edge.value + 1; PacketPtr update = createUpdatePacket( - curr_edge->neighbor, update_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(update); - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update to addr: %lu with value: %d.\n", - __func__, curr_edge->neighbor, update_value); - reqOffsetMap[pkt->req] = reqOffsetMap[pkt->req] + sizeof(Edge); - assert(reqOffsetMap[pkt->req] <= peerMemoryAtomSize); - reqNumEdgeMap[pkt->req]--; - assert(reqNumEdgeMap[pkt->req] >= 0); - } + curr_edge.dst, update_value); + + reqPort.sendPacket(update); + stats.numUpdates++; + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " + "with value: %d.\n", __func__, curr_edge.src, + curr_edge.dst, update_value); + - if (reqNumEdgeMap[pkt->req] == 0) { - reqOffsetMap.erase(pkt->req); - reqNumEdgeMap.erase(pkt->req); - reqValueMap.erase(pkt->req); - memRespQueue.pop_front(); - delete pkt; + edge_list.pop_front(); + if (edge_list.empty()) { + edgeQueue.pop_front(); } - if (!nextPushEvent.scheduled() && !memRespQueue.empty()) { + assert(!nextPushEvent.pending()); + assert(!nextPushEvent.scheduled()); + if (!edgeQueue.empty()) { schedule(nextPushEvent, nextCycle()); } } @@ -354,17 +324,6 @@ 
PushEngine::createUpdatePacket(Addr addr, T value) return pkt; } -bool -PushEngine::allocatePushSpace() { - if ((pushReqQueueSize == 0) || - ((pushReqQueue.size() < pushReqQueueSize) && (numTotalRetries == 0))) { - return true; - } else { - numTotalRetries++; - return false; - } -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 7fb6c42579..c79b0de944 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -49,14 +49,14 @@ class PushEngine : public BaseMemoryEngine size_t _step; size_t _atom; - uint32_t _value; Addr _src; + uint32_t _value; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, uint32_t value, Addr src): + size_t atom, Addr src, uint32_t value): _start(start), _end(end), _step(step), - _atom(atom), _value(value), _src(src) + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -109,38 +109,34 @@ class PushEngine : public BaseMemoryEngine virtual void recvReqRetry(); }; + ReqPort reqPort; + bool _running; int numElementsPerLine; CoalesceEngine* peerCoalesceEngine; - ReqPort reqPort; - - int pushReqQueueSize; - int numTotalRetries; - int numPendingRetries; - std::deque pushReqQueue; - - // TODO: Add size one size for all these maps - std::unordered_map reqOffsetMap; - std::unordered_map reqNumEdgeMap; - std::unordered_map reqValueMap; + int numPendingPulls; + int edgePointerQueueSize; + std::deque edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; - int memRespQueueSize; - std::deque memRespQueue; - std::deque> edgeQueue; + int edgeQueueSize; + std::deque> edgeQueue; template PacketPtr createUpdatePacket(Addr addr, T value); + EventFunctionWrapper nextVertexPullEvent; + void processNextVertexPullEvent(); + MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - EventFunctionWrapper nextPushEvent; + 
MemoryEvent nextPushEvent; void processNextPushEvent(); - EventFunctionWrapper nextSendRetryEvent; - void processNextSendRetryEvent(); + bool vertexSpace(); + bool workLeft(); struct PushStats : public statistics::Group { @@ -166,24 +162,14 @@ class PushEngine : public BaseMemoryEngine Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; - bool allocatePushSpace(); - - void deallocatePushSpace(int space); - - void recvWLItem(WorkListItem wl); - - void recvWLItemRetry(WorkListItem wl); - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, int elements_per_line); - int getNumRetries() { return numTotalRetries; } + void recvReqRetry(); - void start(); // CoalesceEngine announcing work - void stop(); // CoalesceEngine announcing no work + void start(); bool running() { return _running; } - void recvWLItem2(Addr addr, WorkListItem wl); - + void recvVertexPush(Addr addr, WorkListItem wl); }; } From 34d8bcef6633e9019c3fd4d3921044eb5bebedeb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 22 Aug 2022 11:51:06 -0700 Subject: [PATCH 144/287] Added sim exit functionality. 
WIP --- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 7 +++++++ src/accl/graph/sega/coalesce_engine.hh | 2 ++ src/accl/graph/sega/push_engine.cc | 11 +++++++++++ src/accl/graph/sega/push_engine.hh | 8 +++++--- src/accl/graph/sega/wl_engine.cc | 6 ++++++ src/accl/graph/sega/wl_engine.hh | 3 ++- 7 files changed, 34 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 102800de92..1f325703bd 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -52,7 +52,7 @@ class CenteralController : public ClockedObject RequestPort(name, owner), owner(owner), _blocked(false), blockedPacket(nullptr) {} - // virtual AddrRangeList getAddrRanges() const; + void sendPacket(PacketPtr pkt); bool blocked() { return _blocked; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dcec2a5f78..57bc99013c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -85,6 +85,13 @@ CoalesceEngine::drain() return DrainState::Drained; } +bool +CoalesceEngine::done() +{ + return needsPush.none() && + memoryFunctionQueue.empty() && peerWLEngine->done(); +} + // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 6969fe2823..b19a1bc461 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -164,6 +164,8 @@ class CoalesceEngine : public BaseMemoryEngine int workCount() { return _workCount; } void recvVertexPull(); + + bool done(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index b5341b3d61..9866c30f5c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -32,6 +32,7 @@ 
#include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -126,6 +127,12 @@ PushEngine::workLeft() return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); } +bool +PushEngine::done() +{ + return edgeQueue.empty() && + edgePointerQueue.empty() && peerCoalesceEngine->done(); +} void PushEngine::start() { @@ -298,6 +305,10 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } + if (done()) { + exitSimLoopNow(name() + " is done."); + } + assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c79b0de944..a42228f4c0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -126,6 +126,9 @@ class PushEngine : public BaseMemoryEngine template PacketPtr createUpdatePacket(Addr addr, T value); + bool vertexSpace(); + bool workLeft(); + EventFunctionWrapper nextVertexPullEvent; void processNextVertexPullEvent(); @@ -135,9 +138,6 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextPushEvent; void processNextPushEvent(); - bool vertexSpace(); - bool workLeft(); - struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -170,6 +170,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + + bool done(); }; } diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 12f4548aa2..e999667ad1 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -121,6 +121,12 @@ WLEngine::getAddrRanges() const return coalesceEngine->getAddrRanges(); } +bool +WLEngine::done() +{ + return registerFile.empty() && updateQueue.empty(); +} + // TODO: Parameterize the number of pops WLEngine can do at a time. 
// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5e8e5b25f3..1360d37132 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -80,7 +80,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; EventFunctionWrapper nextReadEvent; @@ -116,6 +115,8 @@ class WLEngine : public BaseReduceEngine void handleIncomingWL(Addr addr, WorkListItem wl); int getRegisterFileSize() { return registerFileSize; } + + bool done(); }; } From 72cdfa6b3a53b4aaf0447b6a2ff3d7877b68abf1 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 26 Aug 2022 09:54:35 -0700 Subject: [PATCH 145/287] Adding a DDR model to the accelerator --- configs/accl/sega.py | 45 +++++++++++++++++++++++++++++------------- src/base/statistics.hh | 2 +- 2 files changed, 32 insertions(+), 15 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e7a704d477..28f9211045 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -2,6 +2,7 @@ import argparse from math import log +import math from m5.objects import * from m5.util.convert import toMemorySize @@ -18,7 +19,7 @@ def __init__(self): self.coalesce_engine = CoalesceEngine( peer_push_engine=self.push_engine, attached_memory_atom_size=32, - cache_size="8MiB", + cache_size="16MiB", num_mshr_entry=32, num_tgts_per_mshr=16) self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, @@ -61,7 +62,7 @@ def __init__(self, self._edge_chunk_size = int(\ toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange(\ + self._edge_ranges = [AddrRange( start=toMemorySize(vertex_memory_size)+\ self._edge_chunk_size*i,\ size=self._edge_chunk_size)\ @@ -69,23 +70,39 @@ def __init__(self, vertex_mem_ctrl = [] edge_mem_ctrl = [] + # vertex_mem_ranges = self._vertex_ranges + + for i in 
range(num_channels): + # vertex_addr_range = vertex_mem_ranges[i] + vertex_interface = DDR4_2400_8x8() + vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface vertex_mem_ctrl.append( - SimpleMemory(range=self._vertex_ranges[i], - bandwidth="19.2GB/s", - latency="30ns") + ctrl ) + + edge_interface = DDR4_2400_8x8( + image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) + edge_interface.range = AddrRange(self._edge_chunk_size) + # start=toMemorySize(vertex_memory_size)+\ + # self._edge_chunk_size*i,\ + # size=self._edge_chunk_size) + # edge_addr_range = edge_mem_range[0] + # edge_interface.range = self._edge_chunk_size + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface edge_mem_ctrl.append( - # SimpleMemory(range=self._edge_ranges[i], - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}") - SimpleMemory(range=AddrRange(self._edge_chunk_size), - bandwidth="4.8GB/s", - latency="30ns", - image_file=f"{graph_path}/edgelist_{i}", - in_addr_map=False) + edge_ctrl ) + # edge_mem_ctrl.append( + # SimpleMemory(range=AddrRange(self._edge_chunk_size), + # bandwidth="4.8GB/s", + # latency="30ns", + # image_file=f"{graph_path}/edgelist_{i}", + # in_addr_map=False) + # ) self.vertex_mem_ctrl = vertex_mem_ctrl self.edge_mem_ctrl = edge_mem_ctrl diff --git a/src/base/statistics.hh b/src/base/statistics.hh index 24cbf714f5..15aeff892e 100644 --- a/src/base/statistics.hh +++ b/src/base/statistics.hh @@ -1052,7 +1052,7 @@ class VectorBase : public DataWrapVec Proxy operator[](off_type index) { - assert (index < size()); + // assert (index < size()); return Proxy(this->self(), index); } }; From 6d0c4011086f1a9c644accc96943fd2026bba3d2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 28 Aug 2022 21:14:54 -0700 Subject: [PATCH 146/287] Completed sim exit. I think... 
--- configs/accl/sega.py | 184 ++++++------------ src/accl/graph/sega/CenteralController.py | 6 +- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/MPU.py | 47 +++++ src/accl/graph/sega/PushEngine.py | 2 - src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/WLEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 23 ++- src/accl/graph/sega/centeral_controller.hh | 13 +- src/accl/graph/sega/coalesce_engine.cc | 78 ++++---- src/accl/graph/sega/coalesce_engine.hh | 11 +- src/accl/graph/sega/mpu.cc | 206 +++++++++++++++++++++ src/accl/graph/sega/mpu.hh | 135 ++++++++++++++ src/accl/graph/sega/push_engine.cc | 73 ++------ src/accl/graph/sega/push_engine.hh | 38 +--- src/accl/graph/sega/wl_engine.cc | 133 ++++--------- src/accl/graph/sega/wl_engine.hh | 43 +---- 17 files changed, 573 insertions(+), 427 deletions(-) create mode 100644 src/accl/graph/sega/MPU.py create mode 100644 src/accl/graph/sega/mpu.cc create mode 100644 src/accl/graph/sega/mpu.hh diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 28f9211045..a0bfb5ddce 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -4,112 +4,8 @@ from math import log import math from m5.objects import * -from m5.util.convert import toMemorySize -class MPU(SubSystem): - def __init__(self): - super(MPU, self).__init__() - self.push_engine = PushEngine(push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64) - # self.push_engine = PushEngine(base_edge_addr=base_edge_addr, - # push_req_queue_size=32, - # attached_memory_atom_size=64, - # resp_queue_size=64) - self.coalesce_engine = CoalesceEngine( - peer_push_engine=self.push_engine, - attached_memory_atom_size=32, - cache_size="16MiB", - num_mshr_entry=32, - num_tgts_per_mshr=16) - self.wl_engine = WLEngine(coalesce_engine=self.coalesce_engine, - update_queue_size=64, - register_file_size=32) - - def getRespPort(self): - return self.wl_engine.resp_port - def setRespPort(self, port): - self.wl_engine.resp_port = 
port - - def getReqPort(self): - return self.push_engine.req_port - def setReqPort(self, port): - self.push_engine.req_port = port - - def getVertexMemPort(self): - return self.coalesce_engine.mem_port - def setVertexMemPort(self, port): - self.coalesce_engine.mem_port = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - -class MPUMemory(SubSystem): - def __init__(self, - num_channels: int, - cache_line_size: int, - vertex_memory_size: str, - edge_memory_size: str, - graph_path: str): - super(MPUMemory, self).__init__() - - self._vertex_ranges = self._interleave_addresses( - AddrRange(start=0, size=vertex_memory_size),\ - num_channels,\ - cache_line_size) - - self._edge_chunk_size = int(\ - toMemorySize(edge_memory_size)/num_channels) - self._edge_ranges = [AddrRange( - start=toMemorySize(vertex_memory_size)+\ - self._edge_chunk_size*i,\ - size=self._edge_chunk_size)\ - for i in range(num_channels)] - - vertex_mem_ctrl = [] - edge_mem_ctrl = [] - # vertex_mem_ranges = self._vertex_ranges - - - for i in range(num_channels): - # vertex_addr_range = vertex_mem_ranges[i] - vertex_interface = DDR4_2400_8x8() - vertex_interface.range = self._vertex_ranges[i] - ctrl = MemCtrl() - ctrl.dram = vertex_interface - vertex_mem_ctrl.append( - ctrl - ) - - edge_interface = DDR4_2400_8x8( - image_file = f"{graph_path}/edgelist_{i}", in_addr_map=False) - edge_interface.range = AddrRange(self._edge_chunk_size) - # start=toMemorySize(vertex_memory_size)+\ - # self._edge_chunk_size*i,\ - # size=self._edge_chunk_size) - # edge_addr_range = edge_mem_range[0] - # edge_interface.range = self._edge_chunk_size - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - edge_mem_ctrl.append( - edge_ctrl - ) - # edge_mem_ctrl.append( - # SimpleMemory(range=AddrRange(self._edge_chunk_size), - # bandwidth="4.8GB/s", - # latency="30ns", - # image_file=f"{graph_path}/edgelist_{i}", - # in_addr_map=False) - # 
) - self.vertex_mem_ctrl = vertex_mem_ctrl - self.edge_mem_ctrl = edge_mem_ctrl - - def _interleave_addresses(self, - plain_range, - num_channels, - cache_line_size): +def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] @@ -123,17 +19,48 @@ def _interleave_addresses(self, intlvMatch=i)) return ret - def getVertexPort(self, i): - return self.vertex_mem_ctrl[i].port - def setVertexPort(self, port, i): - self.vertex_mem_ctrl[i].port = port +class GPT(SubSystem): + def __init__(self, edge_memory_size: str): + super().__init__() + self.wl_engine = WLEngine(update_queue_size=64, + register_file_size=32) + self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + cache_size="8MiB", + num_mshr_entry=32, + num_tgts_per_mshr=16) + self.push_engine = PushEngine(push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64) + self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s") + self.edge_mem_ctrl = SimpleMemory(latency="30ns", + latency_var="0ns", + bandwidth="19.2GiB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU(wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine) - def getEdgeBaseAddr(self, i): - return self._edge_ranges[i].start - def getEdgePort(self, i): - return self.edge_mem_ctrl[i].port - def setEdgePort(self, port, i): - self.edge_mem_ctrl[i].port = port + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + 
self.edge_mem_ctrl.image_file = edge_image class SEGA(System): def __init__(self, @@ -158,21 +85,19 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - self.mem_ctrl = MPUMemory( - num_mpus, - self.cache_line_size, - "2GiB", - "14GiB", - graph_path) + vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) - mpus = [] + gpts = [] for i in range(num_mpus): - mpus.append(MPU()) - mpus[i].setReqPort(self.interconnect.cpu_side_ports) - mpus[i].setRespPort(self.interconnect.mem_side_ports) - mpus[i].setVertexMemPort(self.mem_ctrl.getVertexPort(i)) - mpus[i].setEdgeMemPort(self.mem_ctrl.getEdgePort(i)) - self.mpu = mpus + gpt = GPT("8GiB") + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] def get_inputs(): argparser = argparse.ArgumentParser() @@ -197,5 +122,4 @@ def get_inputs(): m5.instantiate() exit_event = m5.simulate() - print(f"Exited simulation because {exit_event.getCause()}") - exit() + print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bd2f6320a8..6f6b12ea2c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -36,7 +36,9 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") req_port = RequestPort("Port to send updates to the outside") - addr = Param.Addr("") - value = Param.Int(0, "") + mpu_vector = VectorParam.MPU("All mpus in the system.") + + addr = Param.Addr("The addr for the initial update") + value = Param.Int("The value for the initial update") image_file = Param.String("Path 
to the global memory image.") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 06c6f92750..14902ef352 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -34,9 +34,6 @@ class CoalesceEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/coalesce_engine.hh" cxx_class = 'gem5::CoalesceEngine' - peer_push_engine = Param.PushEngine(NULL, "PushEngine in the same GPT.") - cache_size = Param.MemorySize("Size of the internal SRAM array.") - num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py new file mode 100644 index 0000000000..2d65be2949 --- /dev/null +++ b/src/accl/graph/sega/MPU.py @@ -0,0 +1,47 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.SimObject import SimObject + +class MPU(SimObject): + type = "MPU" + cxx_header = "accl/graph/sega/mpu.hh" + cxx_class = "gem5::MPU" + + system = Param.System(Parent.any, "System this MPU is a part of") + + in_port = ResponsePort("Port to receive updates from outside") + out_port = RequestPort("Port to send updates to the outside") + + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " + "MPU object.") + coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " + "each instance of MPU object.") + push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " + "instance of MPU object.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index a45f5d6ead..f98f22ba9d 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - req_port = RequestPort("Port to send updates to the outside") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index ae216ccdd4..42a8d84ad5 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -30,12 +30,14 @@ Import('*') 
SimObject('BaseMemoryEngine.py') SimObject('CenteralController.py') SimObject('CoalesceEngine.py') +SimObject("MPU.py") SimObject('PushEngine.py') SimObject('WLEngine.py') Source('base_memory_engine.cc') Source('centeral_controller.cc') Source('coalesce_engine.cc') +Source("mpu.cc") Source('push_engine.cc') Source('wl_engine.cc') diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 98089328f4..52ca031260 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,9 +34,6 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' - resp_port = ResponsePort("Port to Receive updates from outside") - coalesce_engine = Param.CoalesceEngine(NULL, "The CoalesceEngine " - "this WLEngine is connected to.") update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") register_file_size = Param.Int("Number of internal registers the " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index f19c93ebac..5ce7228abb 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,10 +28,13 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { @@ -43,7 +46,12 @@ CenteralController::CenteralController reqPort(name() + ".req_port", this), addr(params.addr), value(params.value) -{} +{ + for (auto mpu : params.mpu_vector) { + mpuVector.push_back(mpu); + mpu->registerCenteralController(this); + } +} Port& CenteralController::getPort(const std::string &if_name, PortID idx) @@ -143,4 +151,17 @@ CenteralController::functionalAccess(PacketPtr pkt) reqPort.sendFunctional(pkt); } +void +CenteralController::recvDoneSignal() +{ + bool done = true; + for 
(auto mpu : mpuVector) { + done &= mpu->done(); + } + + if (done) { + exitSimLoopNow("no update left to process."); + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 1f325703bd..c54c4c04ef 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -29,7 +29,10 @@ #ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ #define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#include + #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/mpu.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,20 +70,20 @@ class CenteralController : public ClockedObject Addr addr; uint32_t value; + std::vector mpuVector; template PacketPtr createUpdatePacket(Addr addr, T value); - - virtual void initState(); - virtual void startup(); - void functionalAccess(PacketPtr pkt); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - Port& getPort(const std::string &if_name, PortID idx=InvalidPortID) override; + virtual void initState(); + virtual void startup(); + + void recvDoneSignal(); }; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 57bc99013c..d791926fe1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -30,7 +30,7 @@ #include -#include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" #include "debug/BitVector.hh" @@ -38,16 +38,16 @@ #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), - peerPushEngine(params.peer_push_engine), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) 
(peerMemoryAtomSize / sizeof(WorkListItem))), - numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { @@ -66,30 +66,20 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - - peerPushEngine->registerCoalesceEngine(this, numElementsPerLine); - needsPush.reset(); } void -CoalesceEngine::registerWLEngine(WLEngine* wl_engine) +CoalesceEngine::registerMPU(MPU* mpu) { - peerWLEngine = wl_engine; -} - -DrainState -CoalesceEngine::drain() -{ - DPRINTF(CoalesceEngine, "%s: drain called.\n"); - return DrainState::Drained; + owner = mpu; } bool CoalesceEngine::done() { - return needsPush.none() && - memoryFunctionQueue.empty() && peerWLEngine->done(); + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } // addr should be aligned to peerMemoryAtomSize @@ -153,17 +143,15 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.push_back(std::make_tuple(addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -418,6 +406,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) return true; } + onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); @@ -439,7 +428,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (needsPush[it + i] == 1) { _workCount--; needsPush[it + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, items[i]); + owner->recvVertexPush(vertex_addr, items[i]); break; } } @@ -492,17 +481,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset])); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, miss_addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d, " - "responseQueueSize = %d.\n", __func__, addr, + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size(), - peerWLEngine->getRegisterFileSize()); + responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -548,18 +535,18 @@ CoalesceEngine::processNextResponseEvent() WorkListItem worklist_response; std::tie(addr_response, worklist_response) = responseQueue.front(); - peerWLEngine->handleIncomingWL(addr_response, worklist_response); + owner->handleIncomingWL(addr_response, worklist_response); DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", __func__, worklist_response.to_string(), addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d, responseQueueSize = %d.\n", __func__, - responseQueue.size(), peerWLEngine->getRegisterFileSize()); + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -720,8 +707,8 @@ CoalesceEngine::processNextApplyEvent() _workCount++; needsPush[bit_index_base + index] = 1; } - if (!peerPushEngine->running()) { - peerPushEngine->start(); + if (!owner->running()) { + owner->start(); } } } @@ -760,6 +747,10 @@ CoalesceEngine::processNextApplyEvent() (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -816,6 +807,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; } void @@ -845,6 +837,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + // onTheFlyReqs++; 
cacheBlocks[block_index].needsWB = false; cacheBlocks[block_index].pendingWB = false; @@ -955,7 +948,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (needsPush[slice_base + i] == 1) { _workCount--; needsPush[slice_base + i] = 0; - peerPushEngine->recvVertexPush(vertex_addr, + owner->recvVertexPush(vertex_addr, cacheBlocks[block_index].items[i]); break; } @@ -967,6 +960,7 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); + onTheFlyReqs++; // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. It can simply set // a flag to true (maybe not even needed just look if the cache has a diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b19a1bc461..03b463e570 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -33,7 +33,6 @@ #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/push_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -43,7 +42,7 @@ namespace gem5 { -class WLEngine; +class MPU; class CoalesceEngine : public BaseMemoryEngine { @@ -93,14 +92,13 @@ class CoalesceEngine : public BaseMemoryEngine bool isRetry; SenderState(bool is_retry): isRetry(is_retry) {} }; - - WLEngine* peerWLEngine; - PushEngine* peerPushEngine; + MPU* owner; int numLines; int numElementsPerLine; Block* cacheBlocks; + int onTheFlyReqs; int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; @@ -156,11 +154,10 @@ class CoalesceEngine : public BaseMemoryEngine public: PARAMS(CoalesceEngine); CoalesceEngine(const Params ¶ms); - virtual DrainState drain() override; + void registerMPU(MPU* mpu); bool recvWLRead(Addr addr); void 
recvWLWrite(Addr addr, WorkListItem wl); - void registerWLEngine(WLEngine* wl_engine); int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc new file mode 100644 index 0000000000..7b1727587a --- /dev/null +++ b/src/accl/graph/sega/mpu.cc @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/mpu.hh" + +#include "accl/graph/sega/centeral_controller.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +MPU::MPU(const Params& params): + SimObject(params), + system(params.system), + wlEngine(params.wl_engine), + coalesceEngine(params.coalesce_engine), + pushEngine(params.push_engine), + inPort(name() + ".inPort", this), + outPort(name() + ".outPort", this) +{ + wlEngine->registerMPU(this); + coalesceEngine->registerMPU(this); + pushEngine->registerMPU(this); +} + +Port& +MPU::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_port") { + return inPort; + } else if (if_name == "out_port") { + return outPort; + } else { + return SimObject::getPort(if_name, idx); + } +} + +void +MPU::init() +{ + localAddrRange = getAddrRanges(); + inPort.sendRangeChange(); +} + +void +MPU::registerCenteralController(CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + +AddrRangeList +MPU::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void +MPU::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +MPU::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +MPU::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +MPU::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +MPU::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +MPU::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } else { + owner->recvReqRetry(); + } +} + +bool +MPU::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +MPU::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); +} + +bool +MPU::handleIncomingUpdate(PacketPtr pkt) +{ + return wlEngine->handleIncomingUpdate(pkt); +} + +void +MPU::handleIncomingWL(Addr addr, WorkListItem wl) +{ + wlEngine->handleIncomingWL(addr, wl); +} + +void +MPU::recvWLWrite(Addr addr, WorkListItem wl) +{ + coalesceEngine->recvWLWrite(addr, wl); +} + +void +MPU::recvVertexPush(Addr addr, WorkListItem wl) +{ + pushEngine->recvVertexPush(addr, wl); +} + +void +MPU::sendPacket(PacketPtr pkt) +{ + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(pkt->getAddr()); + } + + if (found_locally) { + // TODO: count number of local updates + + } else { + // TOOD: count number of remote updates + + } + + outPort.sendPacket(pkt); +} + +void +MPU::recvDoneSignal() +{ + centeralController->recvDoneSignal(); +} + +bool +MPU::done() +{ + return wlEngine->done() && coalesceEngine->done() && pushEngine->done(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh new file mode 100644 index 0000000000..edf0350caf --- /dev/null +++ b/src/accl/graph/sega/mpu.hh @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_MPU_HH__ +#define __ACCL_GRAPH_SEGA_MPU_HH__ + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/push_engine.hh" +#include "accl/graph/sega/wl_engine.hh" +#include "base/addr_range.hh" +#include "mem/packet.hh" +#include "mem/port.hh" +#include "sim/sim_object.hh" +#include "sim/system.hh" +#include "params/MPU.hh" + +namespace gem5 +{ + +class CenteralController; + +class MPU : public SimObject +{ + private: + class RespPort : public ResponsePort + { + private: + MPU* owner; + bool needSendRetryReq; + + public: + RespPort(const std::string& name, MPU* owner): + ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + {} + virtual AddrRangeList getAddrRanges() const; + + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class ReqPort : public RequestPort + { + private: + MPU* owner; + PacketPtr blockedPacket; + + public: + ReqPort(const std::string& name, MPU* owner) : + RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + System* system; + CenteralController* centeralController; + + WLEngine* wlEngine; + CoalesceEngine* coalesceEngine; + PushEngine* pushEngine; + + RespPort inPort; + ReqPort outPort; + + AddrRangeList localAddrRange; + + public: + PARAMS(MPU); + MPU(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; + void registerCenteralController(CenteralController* centeral_controller); + + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + void recvFunctional(PacketPtr pkt) { 
coalesceEngine->recvFunctional(pkt); } + + bool handleIncomingUpdate(PacketPtr pkt); + void checkRetryReq() { inPort.checkRetryReq(); } + void handleIncomingWL(Addr addr, WorkListItem wl); + bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return coalesceEngine->workCount(); } + void recvVertexPull() { return coalesceEngine->recvVertexPull(); } + bool running() { return pushEngine->running(); } + void start() { return pushEngine->start(); } + void recvVertexPush(Addr addr, WorkListItem wl); + + bool blocked() { return outPort.blocked(); } + void sendPacket(PacketPtr pkt); + void recvReqRetry() { pushEngine->recvReqRetry(); } + + void recvDoneSignal(); + bool done(); +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_MPU_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9866c30f5c..0134133cfa 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "debug/TempFlag.hh" #include "mem/packet_access.hh" @@ -37,9 +38,8 @@ namespace gem5 { -PushEngine::PushEngine(const Params ¶ms): +PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), - reqPort(name() + ".req_port", this), _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), @@ -49,22 +49,10 @@ PushEngine::PushEngine(const Params ¶ms): stats(*this) {} -Port& -PushEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return BaseMemoryEngine::getPort(if_name, idx); - } -} - void -PushEngine::registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line) 
+PushEngine::registerMPU(MPU* mpu) { - peerCoalesceEngine = coalesce_engine; - numElementsPerLine = elements_per_line; + owner = mpu; } void @@ -77,43 +65,6 @@ PushEngine::recvReqRetry() } } -void -PushEngine::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. - DPRINTF(PushEngine, "%s: Sending pakcet: %s to " - "the network.\n", __func__, pkt->print()); - if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - DPRINTF(PushEngine, "%s: MemPort blocked.\n", __func__); - } else { - DPRINTF(PushEngine, "%s: Packet sent successfully.\n", __func__); - owner->recvReqRetry(); - } -} - -bool -PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -PushEngine::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); - - _blocked = false; - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); -} - bool PushEngine::vertexSpace() { @@ -124,15 +75,17 @@ PushEngine::vertexSpace() bool PushEngine::workLeft() { - return ((peerCoalesceEngine->workCount() - numPendingPulls) > 0); + return ((owner->workCount() - numPendingPulls) > 0); } bool PushEngine::done() { return edgeQueue.empty() && - edgePointerQueue.empty() && peerCoalesceEngine->done(); + (onTheFlyMemReqs == 0) && + edgePointerQueue.empty(); } + void PushEngine::start() { @@ -152,7 +105,7 @@ PushEngine::processNextVertexPullEvent() { // TODO: change edgePointerQueueSize numPendingPulls++; - peerCoalesceEngine->recvVertexPull(); + owner->recvVertexPull(); if (!workLeft()) { _running = false; @@ -277,7 +230,7 @@ PushEngine::handleMemResp(PacketPtr pkt) void PushEngine::processNextPushEvent() { - if (reqPort.blocked()) { + if (owner->blocked()) { nextPushEvent.sleep(); return; } @@ 
-293,7 +246,7 @@ PushEngine::processNextPushEvent() PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); - reqPort.sendPacket(update); + owner->sendPacket(update); stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, @@ -305,10 +258,6 @@ PushEngine::processNextPushEvent() edgeQueue.pop_front(); } - if (done()) { - exitSimLoopNow(name() + " is done."); - } - assert(!nextPushEvent.pending()); assert(!nextPushEvent.scheduled()); if (!edgeQueue.empty()) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a42228f4c0..6f92b62be0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -38,6 +38,7 @@ namespace gem5 { class CoalesceEngine; +class MPU; class PushEngine : public BaseMemoryEngine { @@ -89,31 +90,9 @@ class PushEngine : public BaseMemoryEngine int numElements; }; - class ReqPort : public RequestPort - { - private: - PushEngine* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, PushEngine* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - - ReqPort reqPort; - bool _running; int numElementsPerLine; - CoalesceEngine* peerCoalesceEngine; + MPU* owner; int numPendingPulls; int edgePointerQueueSize; @@ -157,20 +136,15 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); - PushEngine(const Params ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - - void registerCoalesceEngine(CoalesceEngine* coalesce_engine, - int elements_per_line); - - void recvReqRetry(); + PushEngine(const Params& params); + void registerMPU(MPU* mpu); void start(); bool running() { return 
_running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvReqRetry(); + bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index e999667ad1..9890eeed76 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,103 +28,61 @@ #include "accl/graph/sega/wl_engine.hh" +#include "accl/graph/sega/mpu.hh" #include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" #include "mem/packet_access.hh" +#include "sim/sim_exit.hh" namespace gem5 { -WLEngine::WLEngine(const WLEngineParams ¶ms): +WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), - respPort(name() + ".resp_port", this), - coalesceEngine(params.coalesce_engine), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{ - coalesceEngine->registerWLEngine(this); -} - -Port& -WLEngine::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "resp_port") { - return respPort; - } else { - return BaseReduceEngine::getPort(if_name, idx); - } -} +{} void -WLEngine::init() +WLEngine::registerMPU(MPU* mpu) { - respPort.sendRangeChange(); + owner = mpu; } -AddrRangeList -WLEngine::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -WLEngine::RespPort::checkRetryReq() +bool +WLEngine::done() { - if (needSendRetryReq) { - DPRINTF(WLEngine, "%s: Sending a RetryReq.\n", __func__); - sendRetryReq(); - needSendRetryReq = false; - } + return registerFile.empty() && updateQueue.empty(); } bool -WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +WLEngine::handleIncomingUpdate(PacketPtr pkt) { - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; + assert(updateQueue.size() <= updateQueueSize); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } - return 
true; -} - -Tick -WLEngine::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -WLEngine::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -WLEngine::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - -void -WLEngine::recvFunctional(PacketPtr pkt) -{ - coalesceEngine->recvFunctional(pkt); -} + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, pkt->getAddr(), pkt->getLE(), + updateQueue.size(), updateQueueSize); -AddrRangeList -WLEngine::getAddrRanges() const -{ - return coalesceEngine->getAddrRanges(); -} + // delete the packet since it's not needed anymore. + delete pkt; -bool -WLEngine::done() -{ - return registerFile.empty() && updateQueue.empty(); + if (!nextReadEvent.scheduled()) { + schedule(nextReadEvent, nextCycle()); + } + return true; } // TODO: Parameterize the number of pops WLEngine can do at a time. @@ -150,7 +108,7 @@ WLEngine::processNextReadEvent() // return a boolean value. It should return an integer/enum // to tell WLEngine why it rejected the read request. Their might // be things that WLEngine can do to fix head of the line blocking. - if (coalesceEngine->recvWLRead(update_addr)) { + if (owner->recvWLRead(update_addr)) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -171,7 +129,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } } } else { @@ -194,7 +152,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - respPort.checkRetryReq(); + owner->checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { @@ -238,7 +196,7 @@ WLEngine::processNextReduceEvent() __func__, addr, workListFile[addr].to_string()); stats.numReduce++; - coalesceEngine->recvWLWrite(addr, workListFile[addr]); + owner->recvWLWrite(addr, workListFile[addr]); registerFile.erase(addr); DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " "registerFile.size = %d, registerFileSize = %d\n", @@ -248,40 +206,15 @@ WLEngine::processNextReduceEvent() __func__, addr, registerFile.size(), registerFileSize); } workListFile.clear(); -} -bool -WLEngine::handleIncomingUpdate(PacketPtr pkt) -{ - assert(updateQueue.size() <= updateQueueSize); - if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { - return false; + if (done()) { + owner->recvDoneSignal(); } - - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); - DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - - - // delete the packet since it's not needed anymore. 
- delete pkt; - - if (!nextReadEvent.scheduled()) { - schedule(nextReadEvent, nextCycle()); - } - return true; } WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) : statistics::Group(&_wl), wl(_wl), - ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 1360d37132..4a0489b123 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -34,42 +34,18 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/data_structs.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" namespace gem5 { +class MPU; + class WLEngine : public BaseReduceEngine { private: - class RespPort : public ResponsePort - { - private: - WLEngine* owner; - bool needSendRetryReq; - - public: - RespPort(const std::string& name, WLEngine* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) - {} - virtual AddrRangeList getAddrRanges() const; - - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - - virtual void init(); - - RespPort respPort; - - CoalesceEngine* coalesceEngine; + MPU* owner; int updateQueueSize; std::deque> updateQueue; @@ -79,9 +55,6 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; - void recvFunctional(PacketPtr pkt); - AddrRangeList getAddrRanges() const; - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); @@ -104,18 +77,12 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); - - WLEngine(const WLEngineParams ¶ms); - - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; + WLEngine(const Params& params); + void 
registerMPU(MPU* mpu); bool handleIncomingUpdate(PacketPtr pkt); - void handleIncomingWL(Addr addr, WorkListItem wl); - int getRegisterFileSize() { return registerFileSize; } - bool done(); }; From 86b82a7286a47a66c9df0b75ef6501d56cefaea3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:24:27 -0700 Subject: [PATCH 147/287] Minor improvements in the code. --- src/accl/graph/sega/coalesce_engine.cc | 60 ++++++++------------------ src/accl/graph/sega/coalesce_engine.hh | 7 ++- 2 files changed, 22 insertions(+), 45 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d791926fe1..ba7878be7a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -140,8 +140,9 @@ CoalesceEngine::recvWLRead(Addr addr) // TODO: Add a hit latency as a param for this object. // Can't just schedule the nextResponseEvent for latency cycles in // the future. - responseQueue.push_back(std::make_tuple(addr, - cacheBlocks[block_index].items[wl_offset])); + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset])); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. 
responseQueue.size = %d.\n", __func__, addr, @@ -434,6 +435,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } DPRINTF(BitVector, "%s: needsPush.count: %d.\n", __func__, needsPush.count()); + + pendingVertexPullReads.erase(addr); delete pkt; return true; } @@ -466,12 +469,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) delete pkt; } - // FIXME: Get rid of servicedIndices (maybe use an iterator) - std::vector servicedIndices; - for (int i = 0; i < MSHR[block_index].size(); i++) { - Addr miss_addr = MSHR[block_index][i]; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; Addr aligned_miss_addr = roundDown(miss_addr, peerMemoryAtomSize); + if (aligned_miss_addr == addr) { int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " @@ -495,28 +497,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // End of the said block - servicedIndices.push_back(i); - // DPRINTF(CoalesceEngine, "%s: Added index: %d of MSHR for cacheBlocks[%d] for " - // "removal.\n", __func__, i, block_index); + it = MSHR[block_index].erase(it); + } else { + it++; } } - // TODO: We Can use taken instead of this - // TODO: Change the MSHR from map to map - int bias = 0; - for (int i = 0; i < servicedIndices.size(); i++) { - Addr print_addr = MSHR[block_index][i - bias]; - MSHR[block_index].erase(MSHR[block_index].begin() + - servicedIndices[i] - bias); - bias++; - DPRINTF(CoalesceEngine, "%s: Addr: %lu has been serviced " - "and is removed.\n", __func__, print_addr); - } - if (MSHR[block_index].empty()) { MSHR.erase(block_index); - // cacheBlocks[block_index].hasConflict = false; } if ((!nextResponseEvent.scheduled()) && @@ -902,24 +890,8 @@ CoalesceEngine::getOptimalBitVectorSlice() 
(!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - // current_score += numElementsPerLine * 2; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = true; - // if (score == max_score_possible) { - // break; - // } - // } return std::make_tuple(true, it); } else if (cacheBlocks[block_index].addr != addr) { - // score += numElementsPerLine; - // if (current_score > score) { - // score = current_score; - // slice_base = it; - // hit_in_cache = false; - // assert(score < max_score_possible); - // } return std::make_tuple(false, it); } } @@ -928,7 +900,7 @@ CoalesceEngine::getOptimalBitVectorSlice() } void -CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; @@ -961,6 +933,8 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; + + pendingVertexPullReads.insert(addr); // TODO: Set a tracking structure so that nextMemoryReadEvent knows // It does not have to read this address anymore. 
It can simply set // a flag to true (maybe not even needed just look if the cache has a @@ -972,9 +946,9 @@ CoalesceEngine::processNextPushRetry(int prev_slice_base, Tick schedule_tick) if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextPushRetry with input " + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " "0 to memoryFunctionQueue.\n", __func__); } } @@ -999,7 +973,7 @@ CoalesceEngine::recvVertexPull() numPullsReceived++; memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { - processNextPushRetry(slice_base, schedule_tick); + processNextVertexPull(slice_base, schedule_tick); }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 03b463e570..75c36f9c03 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,12 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalBitVectorSlice(); + std::unordered_set pendingVertexPullReads; + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextPushRetry(int slice_base, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; + void processNextVertexPull(int slice_base, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); From 8bbe1cd51f5d04ddb366519316e4427840c69943 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:00:19 -0700 Subject: [PATCH 
148/287] Added HBM as vertex memory. It doesn't exit! --- configs/accl/sega.py | 36 +++++++++++++++++++++++++----------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a0bfb5ddce..2c44c1f7eb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,20 +20,26 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size: str): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, - cache_size="8MiB", + cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=16) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s") + + vertex_interface = HBM_1000_4H_1x128() + # vertex_interface.range = self._vertex_ranges[i] + ctrl = MemCtrl() + ctrl.dram = vertex_interface + self.vertex_mem_ctrl = ctrl + # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", + # latency_var="0ns", + # bandwidth="19.2GiB/s") self.edge_mem_ctrl = SimpleMemory(latency="30ns", latency_var="0ns", bandwidth="19.2GiB/s", @@ -58,7 +64,8 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range + # self.vertex_mem_ctrl.range = vertex_range + self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image @@ -66,6 +73,7 @@ class SEGA(System): def __init__(self, num_mpus, vertex_cache_line_size, + cache_size, graph_path, first_addr, first_value): @@ -85,11 +93,15 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - vertex_ranges = 
interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + # vertex_ranges = interleave_addresses(AddrRange("4GiB"), num_mpus, vertex_cache_line_size) + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"),\ + num_mpus,\ + vertex_cache_line_size) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB") + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -103,19 +115,21 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - return args.num_mpus, args.vertex_cache_line_size, \ + print("******* ", args.cache_size) + return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, \ + num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, \ + system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ graph_path, first_addr, first_value) root = Root(full_system = False, system = system) From 25ded8a0636ea641d9da9a8cbe913f91e9f0c08b Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 1 Sep 2022 21:24:19 -0700 Subject: [PATCH 149/287] Adding Real memory for EM --- configs/accl/sega.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2c44c1f7eb..e9286deafc 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ 
-20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size, cache_size: str, i): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -40,11 +40,13 @@ def __init__(self, edge_memory_size, cache_size: str): # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", # latency_var="0ns", # bandwidth="19.2GiB/s") - self.edge_mem_ctrl = SimpleMemory(latency="30ns", - latency_var="0ns", - bandwidth="19.2GiB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False) + edge_interface = DDR4_2400_8x8( + device_size = edge_memory_size, + image_file = f"{graph_path}/edgelist_{i}", + in_addr_map=False) + edge_ctrl = MemCtrl() + edge_ctrl.dram = edge_interface + self.edge_mem_ctrl = edge_ctrl self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -67,7 +69,7 @@ def set_vertex_range(self, vertex_range): # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, @@ -101,7 +103,7 @@ def __init__(self, gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("8GiB", cache_size, i) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) From 0f69be29a97f915680b809fb3febc19543c60c99 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:38:00 -0700 Subject: [PATCH 150/287] Fixing style. 
--- configs/accl/sega.py | 37 ++++++++++++++----------------------- 1 file changed, 14 insertions(+), 23 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e9286deafc..1e360676cb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str, i): + def __init__(self, edge_memory_size, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) @@ -31,18 +31,14 @@ def __init__(self, edge_memory_size, cache_size: str, i): self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - - vertex_interface = HBM_1000_4H_1x128() - # vertex_interface.range = self._vertex_ranges[i] + + vertex_interface = HBM_1000_4H_1x128(burst_length=2) ctrl = MemCtrl() ctrl.dram = vertex_interface self.vertex_mem_ctrl = ctrl - # self.vertex_mem_ctrl = SimpleMemory(latency="30ns", - # latency_var="0ns", - # bandwidth="19.2GiB/s") + edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - image_file = f"{graph_path}/edgelist_{i}", + device_size = edge_memory_size, in_addr_map=False) edge_ctrl = MemCtrl() edge_ctrl.dram = edge_interface @@ -74,7 +70,6 @@ def set_edge_image(self, edge_image): class SEGA(System): def __init__(self, num_mpus, - vertex_cache_line_size, cache_size, graph_path, first_addr, @@ -83,7 +78,7 @@ def __init__(self, self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = vertex_cache_line_size + self.cache_line_size = 32 self.mem_mode = "timing" self.interconnect = NoncoherentXBar(frontend_latency=1, @@ -95,15 +90,14 @@ def __init__(self, image_file=f"{graph_path}/vertices") self.ctrl.req_port = self.interconnect.cpu_side_ports - # vertex_ranges = interleave_addresses(AddrRange("4GiB"), 
num_mpus, vertex_cache_line_size) vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"),\ - num_mpus,\ - vertex_cache_line_size) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size, i) + gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpt.setReqPort(self.interconnect.cpu_side_ports) @@ -116,23 +110,20 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_mpus", type=int) - argparser.add_argument("vertex_cache_line_size", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph_path", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - print("******* ", args.cache_size) - return args.num_mpus, args.vertex_cache_line_size, args.cache_size, \ + + return args.num_mpus, args.cache_size, \ args.graph_path, args.init_addr, args.init_value if __name__ == "__m5_main__": - num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value = get_inputs() + num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, vertex_cache_line_size, cache_size, \ - graph_path, first_addr, first_value) + system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) root = Root(full_system = False, system = system) m5.instantiate() From 16bb60f064fadacb1a8cb62eaf6bc0d0a6aacffd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 1 Sep 2022 21:44:37 -0700 Subject: [PATCH 151/287] Khoshgelation. 
--- configs/accl/sega.py | 19 ++++++------------- 1 file changed, 6 insertions(+), 13 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1e360676cb..b023507a39 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -22,27 +22,21 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, + self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=16) + num_tgts_per_mshr=32) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) - vertex_interface = HBM_1000_4H_1x128(burst_length=2) - ctrl = MemCtrl() - ctrl.dram = vertex_interface - self.vertex_mem_ctrl = ctrl + self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - edge_interface = DDR4_2400_8x8( - device_size = edge_memory_size, - in_addr_map=False) - edge_ctrl = MemCtrl() - edge_ctrl.dram = edge_interface - self.edge_mem_ctrl = edge_ctrl + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -62,7 +56,6 @@ def setReqPort(self, port): self.mpu.out_port = port def set_vertex_range(self, vertex_range): - # self.vertex_mem_ctrl.range = vertex_range self.vertex_mem_ctrl.dram.range = vertex_range def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image From 99f997f387edb67177ee3789522db4d0f0f986be Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 2 Sep 2022 07:47:19 -0700 Subject: [PATCH 152/287] Adding new stats. 
--- configs/accl/sega.py | 3 +- src/accl/graph/sega/CoalesceEngine.py | 2 + src/accl/graph/sega/coalesce_engine.cc | 71 +++++++++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- 4 files changed, 56 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b023507a39..5cf557719f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -27,7 +27,8 @@ def __init__(self, edge_memory_size, cache_size: str): self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, - num_tgts_per_mshr=32) + num_tgts_per_mshr=32, + max_resp_per_cycle=4) self.push_engine = PushEngine(push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 14902ef352..2cc756ff3f 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,3 +37,5 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") num_mshr_entry = Param.Int("Number of MSHR entries.") num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. 
Used to limit b/w.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ba7878be7a..1715d637f1 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,6 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -141,7 +142,7 @@ CoalesceEngine::recvWLRead(Addr addr) // Can't just schedule the nextResponseEvent for latency cycles in // the future. responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset])); + addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", @@ -197,6 +198,7 @@ CoalesceEngine::recvWLRead(Addr addr) "cacheBlocks[%d].\n", __func__, block_index); } MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -312,6 +314,7 @@ CoalesceEngine::recvWLRead(Addr addr) } // cacheBlocks[block_index].hasConflict = true; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); stats.readMisses++; @@ -344,6 +347,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" " Addr: %lu.\n", __func__, block_index, addr); MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); 
DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " "for cacheBlocks[%d].\n", __func__, addr, block_index); memoryFunctionQueue.emplace_back( @@ -382,11 +386,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: There is room for another target " "for cacheBlocks[%d].\n", __func__, block_index); - // cacheBlocks[block_index].hasConflict = true; // TODO: Might want to differentiate between different misses. stats.readMisses++; MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " "cacheBlocks[%d].\n", __func__, addr, block_index); stats.numVertexReads++; @@ -481,7 +485,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "packet.\n",__func__, miss_addr, block_index); // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset])); + cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, @@ -519,22 +523,36 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) void CoalesceEngine::processNextResponseEvent() { + int num_responses_sent = 0; + Addr addr_response; WorkListItem worklist_response; - - std::tie(addr_response, worklist_response) = responseQueue.front(); - owner->handleIncomingWL(addr_response, worklist_response); - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, worklist_response.to_string(), addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + if ((num_responses_sent >= maxRespPerCycle) || + (responseQueue.empty())) { + break; + } + } if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { @@ -694,9 +712,9 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; - } - if (!owner->running()) { - owner->start(); + if (!owner->running()) { + owner->start(); + } } } } @@ -997,10 +1015,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(readRejections, statistics::units::Count::get(), "Number of cache rejections."), - ADD_STAT(falseApplySchedules, statistics::units::Count::get(), - "Number of failed apply schedules."), - ADD_STAT(falseEvictSchedules, statistics::units::Count::get(), - "Number of failed evict schedules.") + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries.") { } @@ -1008,6 +1026,11 @@ void CoalesceEngine::CoalesceStats::regStats() { using 
namespace statistics; + + mshrEntryLength.init(64); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 75c36f9c03..641ed327bb 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -102,7 +102,8 @@ class CoalesceEngine : public BaseMemoryEngine int numMSHREntries; int numTgtsPerMSHR; std::unordered_map> MSHR; - std::deque> responseQueue; + int maxRespPerCycle; + std::deque> responseQueue; int _workCount; int numPullsReceived; @@ -144,8 +145,9 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar readRejections; - statistics::Scalar falseApplySchedules; - statistics::Scalar falseEvictSchedules; + + statistics::Formula hitRate; + statistics::Histogram mshrEntryLength; }; CoalesceStats stats; From c8a4614a803d97b0c714637cc3196e8df646338a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 4 Sep 2022 20:42:43 -0700 Subject: [PATCH 153/287] Fixing asserion error on busyMask. 
--- configs/accl/sega.py | 2 +- src/accl/graph/sega/busyMaskErr | 16 ++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 7 ++++++- 3 files changed, 23 insertions(+), 2 deletions(-) create mode 100644 src/accl/graph/sega/busyMaskErr diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 5cf557719f..3fa5b99b3a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -20,7 +20,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): return ret class GPT(SubSystem): - def __init__(self, edge_memory_size, cache_size: str): + def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine(update_queue_size=32, register_file_size=32) diff --git a/src/accl/graph/sega/busyMaskErr b/src/accl/graph/sega/busyMaskErr new file mode 100644 index 0000000000..316fcd37d9 --- /dev/null +++ b/src/accl/graph/sega/busyMaskErr @@ -0,0 +1,16 @@ +gem5/build/NULL/gem5.opt -re --outdir=debug --debug-flags=CacheBlockState gem5/configs/accl/sega.py 1 1KiB /home/fariborz/SEGA/graphs/test/scale_21/binaries/mpu_1/ 0 0 + +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964143000: system.gpts.coalesce_engine: handleMemResp: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 2, valid: true, needsApply: false, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964143000}. 
+32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964145000}. +32964145000: system.gpts.coalesce_engine: recvWLWrite: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: true, needsWB: false, pendingData: false, pendingApply: true, pendingWB: false, lastChangedTick: 32964145000}. +32964146000: system.gpts.coalesce_engine: processNextApplyEvent: cacheBlock[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 0, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: true, lastChangedTick: 32964146000}. +32964146000: system.gpts.coalesce_engine: recvWLRead: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. +32964147000: system.gpts.coalesce_engine: processNextWriteBack: cacheBlocks[2]: CacheBlock{addr: 469056, busyMask: 1, valid: true, needsApply: false, needsWB: true, pendingData: false, pendingApply: false, pendingWB: false, lastChangedTick: 32964146000}. + +// This assertion would be hit although it should not. +// It is fixed by a hack in recvWLRead when hit in the cache. 
+assert(cacheBlocks[block_index].busyMask == 0); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1715d637f1..3ff867c274 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -162,7 +162,12 @@ CoalesceEngine::recvWLRead(Addr addr) // and skip the process if the respective bit is set to false. cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: If a read happens on the same cycle as another operation such + // apply setLastChangedTick to half a cycle later so that operations + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); From 9ad5fa2f9175be1f2254bc2a0d7b92764b71d96f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 5 Sep 2022 14:27:49 -0700 Subject: [PATCH 154/287] Fixing finding work in coalesce engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 90 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 3 +- src/accl/graph/sega/mpu.cc | 4 +- 3 files changed, 54 insertions(+), 43 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 3ff867c274..7a52d29c98 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), startSearchIndex(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -79,6 +79,9 @@ CoalesceEngine::registerMPU(MPU* mpu) bool CoalesceEngine::done() { + bool push_none = needsPush.none(); + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -885,41 +888,46 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalBitVectorSlice() +std::tuple +CoalesceEngine::getOptimalPullAddr() { - bool hit_in_cache = false; - int slice_base = -1; - - // int score = 0; - // int max_score_possible = 3 * numElementsPerLine; - for (int it = 0; it < MAX_BITVECTOR_SIZE; it += numElementsPerLine) { - // int current_score = 0; + int it = startSearchIndex; + int initial_search_index = startSearchIndex; + while (true) { uint32_t current_popcount = 0; for (int i = 0; i < numElementsPerLine; i++) { current_popcount += needsPush[it + i]; } - if (current_popcount == 0) { - continue; + if (current_popcount != 0) { + Addr addr = getBlockAddrFromBitIndex(it); + int block_index = getBlockIndex(addr); + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + // Otherwise if it is in memory + } else if (cacheBlocks[block_index].addr != addr) { + if (pendingVertexPullReads.find(addr) != + pendingVertexPullReads.end()) { + startSearchIndex = + (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + return std::make_tuple(true, it, addr); + } + } } - // current_score += current_popcount; - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); - // Idle state: valid && !pendingApply && !pendingWB - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - return std::make_tuple(true, it); - } else if (cacheBlocks[block_index].addr != addr) { - return std::make_tuple(false, it); + it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; + if (it == initial_search_index) { + break; } } - - return std::make_tuple(hit_in_cache, slice_base); + // return garbage + return std::make_tuple(false, -1, 0); } void @@ -927,10 +935,10 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) { bool hit_in_cache; int slice_base; - std::tie(hit_in_cache, slice_base) = getOptimalBitVectorSlice(); + Addr addr; + std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); if (slice_base != -1) { - Addr addr = getBlockAddrFromBitIndex(slice_base); int block_index = getBlockIndex(addr); if (hit_in_cache) { 
assert(cacheBlocks[block_index].valid); @@ -958,10 +966,6 @@ CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) onTheFlyReqs++; pendingVertexPullReads.insert(addr); - // TODO: Set a tracking structure so that nextMemoryReadEvent knows - // It does not have to read this address anymore. It can simply set - // a flag to true (maybe not even needed just look if the cache has a - // line allocated for it in the cacheBlocks). } numPullsReceived--; } @@ -993,14 +997,18 @@ CoalesceEngine::recvMemRetry() void CoalesceEngine::recvVertexPull() { + bool should_schedule = (numPullsReceived == 0); numPullsReceived++; - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); + + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 641ed327bb..92c28ae11e 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -107,13 +107,14 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalBitVectorSlice(); + std::tuple getOptimalPullAddr(); std::unordered_set pendingVertexPullReads; diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 7b1727587a..63aa474542 100644 --- a/src/accl/graph/sega/mpu.cc +++ 
b/src/accl/graph/sega/mpu.cc @@ -194,7 +194,9 @@ MPU::sendPacket(PacketPtr pkt) void MPU::recvDoneSignal() { - centeralController->recvDoneSignal(); + if (done()) { + centeralController->recvDoneSignal(); + } } bool From d57d301f767ea1ed4268b6a6293d7c0c4ee040c5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 6 Sep 2022 14:21:37 -0700 Subject: [PATCH 155/287] Fixing choosing work in coalesce engine. --- src/accl/graph/sega/SConscript | 2 - src/accl/graph/sega/coalesce_engine.cc | 247 ++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 18 +- src/accl/graph/sega/push_engine.cc | 3 - 4 files changed, 194 insertions(+), 76 deletions(-) diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 42a8d84ad5..5d48b46fba 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -43,13 +43,11 @@ Source('wl_engine.cc') DebugFlag('ApplyUpdates') DebugFlag('BaseMemoryEngine') -DebugFlag('BitVector') DebugFlag('CenteralController') DebugFlag('CacheBlockState') DebugFlag('CoalesceEngine') DebugFlag('PushEngine') DebugFlag('SEGAStructureSize') -DebugFlag('TempFlag') DebugFlag('WLEngine') CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a52d29c98..cf0e2872f6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -33,7 +33,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" #include "debug/ApplyUpdates.hh" -#include "debug/BitVector.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -80,7 +79,7 @@ bool CoalesceEngine::done() { bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", + DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", __func__, push_none ? 
"true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); @@ -428,26 +427,23 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) (cacheBlocks[block_index].valid))); // We have read the address to send the wl and it is not in the // cache. Simply send the items to the PushEngine. + + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); - DPRINTF(CoalesceEngine, "%s: Received read response for retry " - "for addr %lu. It was not found in the cache.\n", - __func__, addr); + uint64_t send_mask = pendingVertexPullReads[addr]; WorkListItem* items = pkt->getPtr(); // No applying of the line needed. - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[it + i] == 1) { - _workCount--; + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; owner->recvVertexPush(vertex_addr, items[i]); - break; } } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - pendingVertexPullReads.erase(addr); delete pkt; return true; @@ -720,6 +716,7 @@ CoalesceEngine::processNextApplyEvent() if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); } @@ -888,19 +885,78 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -std::tuple +// std::tuple +// CoalesceEngine::getOptimalPullAddr() +// { +// int it = startSearchIndex; +// int initial_search_index = startSearchIndex; +// while (true) { +// uint32_t current_popcount = 0; +// for (int i = 0; i < numElementsPerLine; i++) { +// current_popcount += needsPush[it + i]; +// } +// if 
(current_popcount != 0) { +// Addr addr = getBlockAddrFromBitIndex(it); +// int block_index = getBlockIndex(addr); +// // Only if it is in cache and it is in idle state. +// if ((cacheBlocks[block_index].addr == addr) && +// (cacheBlocks[block_index].valid) && +// (cacheBlocks[block_index].busyMask == 0) && +// (!cacheBlocks[block_index].pendingApply) && +// (!cacheBlocks[block_index].pendingWB)) { +// assert(!cacheBlocks[block_index].needsApply); +// assert(!cacheBlocks[block_index].pendingData); +// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// // Otherwise if it is in memory +// } else if (cacheBlocks[block_index].addr != addr) { +// if (pendingVertexPullReads.find(addr) != +// pendingVertexPullReads.end()) { +// startSearchIndex = +// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// return std::make_tuple(true, it, addr); +// } +// } +// } +// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; +// if (it == initial_search_index) { +// break; +// } +// } +// // return garbage +// return std::make_tuple(false, -1, 0); +// } + +std::tuple CoalesceEngine::getOptimalPullAddr() { - int it = startSearchIndex; - int initial_search_index = startSearchIndex; - while (true) { - uint32_t current_popcount = 0; - for (int i = 0; i < numElementsPerLine; i++) { - current_popcount += needsPush[it + i]; - } - if (current_popcount != 0) { - Addr addr = getBlockAddrFromBitIndex(it); - int block_index = getBlockIndex(addr); + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != 
pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::PENDING_READ, addr, index_offset); + /* + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask = 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + */ + } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid) && @@ -909,67 +965,122 @@ CoalesceEngine::getOptimalPullAddr() (!cacheBlocks[block_index].pendingWB)) { assert(!cacheBlocks[block_index].needsApply); assert(!cacheBlocks[block_index].pendingData); - startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if (cacheBlocks[block_index].addr != addr) { - if (pendingVertexPullReads.find(addr) != - pendingVertexPullReads.end()) { - startSearchIndex = - (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - return std::make_tuple(true, it, addr); - } + activeBits.pop_front(); + return std::make_tuple( + BitStatus::IN_MEMORY, addr, index_offset); } } - it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; - if (it == initial_search_index) { - break; - } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; } - // return garbage - return std::make_tuple(false, -1, 0); + + return std::make_tuple(BitStatus::GARBAGE, 0, 0); } +// void +// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +// { +// bool hit_in_cache; +// int slice_base; +// Addr addr; + +// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); +// if (slice_base != 
-1) { +// int block_index = getBlockIndex(addr); +// if (hit_in_cache) { +// assert(cacheBlocks[block_index].valid); +// assert(cacheBlocks[block_index].busyMask == 0); + +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// for (int i = 0; i < numElementsPerLine; i++) { +// Addr vertex_addr = addr + i * sizeof(WorkListItem); +// if (needsPush[slice_base + i] == 1) { +// _workCount--; +// needsPush[slice_base + i] = 0; +// owner->recvVertexPush(vertex_addr, +// cacheBlocks[block_index].items[i]); +// break; +// } +// } +// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", +// __func__, needsPush.count()); +// } else { +// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); +// SenderState* sender_state = new SenderState(true); +// pkt->pushSenderState(sender_state); +// memPort.sendPacket(pkt); +// onTheFlyReqs++; +// pendingVertexPullReads.insert(addr); +// } +// numPullsReceived--; +// } + +// if (numPullsReceived > 0) { +// memoryFunctionQueue.emplace_back( +// [this] (int slice_base, Tick schedule_tick) { +// processNextVertexPull(slice_base, schedule_tick); +// }, 0, curTick()); +// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " +// "0 to memoryFunctionQueue.\n", __func__); +// } +// } + void -CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - bool hit_in_cache; - int slice_base; - Addr addr; - - std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); - if (slice_base != -1) { - int block_index = getBlockIndex(addr); - if (hit_in_cache) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - if (needsPush[slice_base + i] == 1) { - _workCount--; - 
needsPush[slice_base + i] = 0; - owner->recvVertexPush(vertex_addr, - cacheBlocks[block_index].items[i]); - break; - } - } - DPRINTF(BitVector, "%s: needsPush.count: %d.\n", - __func__, needsPush.count()); - } else { + BitStatus bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != BitStatus::GARBAGE) { + if (bit_status == BitStatus::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + } + if (bit_status == BitStatus::IN_CACHE) { + // renaming the outputs to their local names. + int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + owner->recvVertexPush( + vertex_addr, cacheBlocks[block_index].items[wl_offset]); + } + if (bit_status == BitStatus::IN_MEMORY) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); SenderState* sender_state = new SenderState(true); pkt->pushSenderState(sender_state); memPort.sendPacket(pkt); onTheFlyReqs++; - - pendingVertexPullReads.insert(addr); + pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; } - if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 92c28ae11e..fe7c83afb2 100644 --- 
a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -42,6 +42,14 @@ namespace gem5 { +enum BitStatus +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -107,22 +115,26 @@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; + // CLEAN: Replace with slice_base_queue int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; + std::deque activeBits; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); - std::unordered_set pendingVertexPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int slice_base, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 0134133cfa..505d41b0b8 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -31,7 +31,6 @@ #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" -#include "debug/TempFlag.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -129,8 +128,6 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, (uint32_t) wl.prop); numPendingPulls--; - DPRINTF(TempFlag, "%s: Received {addr: %lu, wl: %s}.\n", - __func__, addr, wl.to_string()); if (workLeft() && 
vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } From 8d4f9b0e2bb82986db1d367e03cc6be48140d55c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 18:36:52 -0700 Subject: [PATCH 156/287] Adding support for synthetic traffic --- configs/accl/sega.py | 125 +++++++++++++++++++++++++++++++++++++++---- 1 file changed, 116 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 3fa5b99b3a..8e901b6e6d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -1,8 +1,35 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + import m5 +import os import argparse +import subprocess from math import log -import math from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): @@ -103,21 +130,101 @@ def __init__(self, def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("num_mpus", type=int) + argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph_path", type=str) + argparser.add_argument("vertex_cache_line_size", type=int) + argparser.add_argument("synthetic", type=bool) + argparser.add_argument("--scale", type=int) + argparser.add_argument("--deg", type=int) + argparser.add_argument("--graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + args = argparser.parse_args() - return args.num_mpus, args.cache_size, \ - args.graph_path, args.init_addr, args.init_value + if args.synthetic: + if (args.scale is None) or (args.deg is None): + raise ValueError("If synthetic is true, you should specify the" + "scale of the graph by --scale [scale] and the average" + "degree of the graph by --deg [average degree].") + else: + if args.graph is None: + raise ValueError("If synthetic is false, you should specify the " + "path to graph binaries by --graph [path to graph].") + return args if __name__ == "__m5_main__": - num_mpus, cache_size, graph_path, first_addr, first_value = get_inputs() - 
- print(f"Creating a system with {num_mpus} mpu(s) and graph {graph_path}") - system = SEGA(num_mpus, cache_size, graph_path, first_addr, first_value) + input_args = get_inputs() + + image_path = None + if input_args.synthetic: + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{input_args.scale}", + f"{input_args.deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{input_args.scale} and deg {input_args.deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in {graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") + print(f"Created 
{graph_path}/binaries/gpts_{input_args.num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{input_args.num_gpts}", + f"{input_args.vertex_cache_line_size}", + f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/n{input_args.num_gpts}") + image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" + else: + image_path = input_args.graph + + system = SEGA(input_args.num_gpts, + input_args.cache_size, + image_path, + input_args.init_addr, + input_args.init_value) root = Root(full_system = False, system = system) m5.instantiate() From 7ddb4cf48879fca09694b983c46ae486bbf97bc2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 27 Jul 2022 23:42:01 -0700 Subject: [PATCH 157/287] Adding workload as a parameter --- configs/accl/sega.py | 2 +- src/accl/graph/sega/PushEngine.py | 2 ++ src/accl/graph/sega/WLEngine.py | 2 ++ src/accl/graph/sega/push_engine.cc | 17 ++++++++++++++++- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 19 +++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++++ 7 files changed, 45 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8e901b6e6d..ddeae34e4e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -215,7 +215,7 @@ def get_inputs(): f"{input_args.vertex_cache_line_size}", f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) print(f"Created the 
graph binaries in " - f"{graph_path}/binaries/n{input_args.num_gpts}") + f"{graph_path}/binaries/gpts_{input_args.num_gpts}") image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" else: image_path = input_args.graph diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index f98f22ba9d..ad9ddfefcf 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -41,3 +41,5 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 52ca031260..a44352ab9b 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -40,3 +40,5 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") # 4 is arbitrary + + workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 505d41b0b8..9f13c00397 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,6 +42,7 @@ PushEngine::PushEngine(const Params& params): _running(false), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPushEvent([this] { processNextPushEvent(); }, name()), @@ -85,6 +86,20 @@ PushEngine::done() edgePointerQueue.empty(); } + +uint32_t +PushEngine::propagate(uint32_t value, uint32_t weight) +{ + uint32_t update; + if (workload == "BFS") { + update = value + 1; + } + else{ + panic("The workload %s is 
not supported", workload); + } + return update; +} + void PushEngine::start() { @@ -239,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = curr_edge.value + 1; + uint32_t update_value = propagate(value, 1); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6f92b62be0..a64a5b1f5b 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -82,7 +82,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } }; - struct PushInfo { Addr src; uint32_t value; @@ -103,6 +102,8 @@ class PushEngine : public BaseMemoryEngine int edgeQueueSize; std::deque> edgeQueue; + std::string workload; + uint32_t propagate(uint32_t value, uint32_t weight); template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9890eeed76..855e36b413 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,6 +41,7 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), + workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) @@ -58,6 +59,18 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } +uint32_t +WLEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { @@ -138,7 +151,8 @@ WLEngine::processNextReadEvent() "addr: %lu in 
registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - std::min(update_value, registerFile[update_addr]); + reduce(update_value, registerFile[update_addr]); + // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -191,7 +205,8 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - std::min(update_value, workListFile[addr].tempProp); + reduce(update_value, workListFile[addr].tempProp); + // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 4a0489b123..b03a3cdb87 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,6 +47,8 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; + + int updateQueueSize; std::deque> updateQueue; @@ -55,6 +57,9 @@ class WLEngine : public BaseReduceEngine std::unordered_map workListFile; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From 302bc6e3e6be79a515890427c50b765a463441b1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 7 Sep 2022 13:22:40 -0700 Subject: [PATCH 158/287] Adding workload as a parameter to coalesce engine. 
--- src/accl/graph/sega/CoalesceEngine.py | 5 ++ src/accl/graph/sega/coalesce_engine.cc | 120 ++++--------------------- src/accl/graph/sega/coalesce_engine.hh | 5 +- src/accl/graph/sega/push_engine.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 2 - 5 files changed, 28 insertions(+), 106 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 2cc756ff3f..f6e997f1e3 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -35,7 +35,12 @@ class CoalesceEngine(BaseMemoryEngine): cxx_class = 'gem5::CoalesceEngine' cache_size = Param.MemorySize("Size of the internal SRAM array.") + num_mshr_entry = Param.Int("Number of MSHR entries.") + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + + workload = Param.String("BFS", "Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index cf0e2872f6..a80d629737 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,7 +49,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), startSearchIndex(0), + _workCount(0), numPullsReceived(0), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -85,6 +85,18 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } +uint32_t +CoalesceEngine::reduce(uint32_t update, uint32_t value) +{ + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} + // addr should be aligned to peerMemoryAtomSize int 
CoalesceEngine::getBlockIndex(Addr addr) @@ -700,8 +712,12 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = std::min(current_prop, - cacheBlocks[block_index].items[index].tempProp); + // NOTE: It might be the case that for workloads other than BFS, + // the reduce function here should be different to the reduce + // function defined in WLEngine. Think about the case of PR in + // detail. + uint32_t new_prop = reduce( + cacheBlocks[block_index].items[index].tempProp, current_prop); if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; @@ -885,48 +901,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } -// std::tuple -// CoalesceEngine::getOptimalPullAddr() -// { -// int it = startSearchIndex; -// int initial_search_index = startSearchIndex; -// while (true) { -// uint32_t current_popcount = 0; -// for (int i = 0; i < numElementsPerLine; i++) { -// current_popcount += needsPush[it + i]; -// } -// if (current_popcount != 0) { -// Addr addr = getBlockAddrFromBitIndex(it); -// int block_index = getBlockIndex(addr); -// // Only if it is in cache and it is in idle state. 
-// if ((cacheBlocks[block_index].addr == addr) && -// (cacheBlocks[block_index].valid) && -// (cacheBlocks[block_index].busyMask == 0) && -// (!cacheBlocks[block_index].pendingApply) && -// (!cacheBlocks[block_index].pendingWB)) { -// assert(!cacheBlocks[block_index].needsApply); -// assert(!cacheBlocks[block_index].pendingData); -// startSearchIndex = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// // Otherwise if it is in memory -// } else if (cacheBlocks[block_index].addr != addr) { -// if (pendingVertexPullReads.find(addr) != -// pendingVertexPullReads.end()) { -// startSearchIndex = -// (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// return std::make_tuple(true, it, addr); -// } -// } -// } -// it = (it + numElementsPerLine) % MAX_BITVECTOR_SIZE; -// if (it == initial_search_index) { -// break; -// } -// } -// // return garbage -// return std::make_tuple(false, -1, 0); -// } - std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -949,13 +923,6 @@ CoalesceEngine::getOptimalPullAddr() activeBits.pop_front(); return std::make_tuple( BitStatus::PENDING_READ, addr, index_offset); - /* - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask = 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - */ } else { // Only if it is in cache and it is in idle state. 
if ((cacheBlocks[block_index].addr == addr) && @@ -983,55 +950,6 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple(BitStatus::GARBAGE, 0, 0); } -// void -// CoalesceEngine::processNextVertexPull(int prev_slice_base, Tick schedule_tick) -// { -// bool hit_in_cache; -// int slice_base; -// Addr addr; - -// std::tie(hit_in_cache, slice_base, addr) = getOptimalPullAddr(); -// if (slice_base != -1) { -// int block_index = getBlockIndex(addr); -// if (hit_in_cache) { -// assert(cacheBlocks[block_index].valid); -// assert(cacheBlocks[block_index].busyMask == 0); - -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// for (int i = 0; i < numElementsPerLine; i++) { -// Addr vertex_addr = addr + i * sizeof(WorkListItem); -// if (needsPush[slice_base + i] == 1) { -// _workCount--; -// needsPush[slice_base + i] = 0; -// owner->recvVertexPush(vertex_addr, -// cacheBlocks[block_index].items[i]); -// break; -// } -// } -// DPRINTF(BitVector, "%s: needsPush.count: %d.\n", -// __func__, needsPush.count()); -// } else { -// PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); -// SenderState* sender_state = new SenderState(true); -// pkt->pushSenderState(sender_state); -// memPort.sendPacket(pkt); -// onTheFlyReqs++; -// pendingVertexPullReads.insert(addr); -// } -// numPullsReceived--; -// } - -// if (numPullsReceived > 0) { -// memoryFunctionQueue.emplace_back( -// [this] (int slice_base, Tick schedule_tick) { -// processNextVertexPull(slice_base, schedule_tick); -// }, 0, curTick()); -// DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " -// "0 to memoryFunctionQueue.\n", __func__); -// } -// } - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index fe7c83afb2..7503d69b76 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -115,8 +115,6 
@@ class CoalesceEngine : public BaseMemoryEngine int _workCount; int numPullsReceived; - // CLEAN: Replace with slice_base_queue - int startSearchIndex; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; @@ -130,6 +128,9 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; + std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9f13c00397..625f836561 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -254,7 +254,7 @@ PushEngine::processNextPushEvent() __func__, curr_edge.to_string()); // TODO: Implement propagate function here - uint32_t update_value = propagate(value, 1); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); PacketPtr update = createUpdatePacket( curr_edge.dst, update_value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 855e36b413..5465769cff 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,7 +152,6 @@ WLEngine::processNextReadEvent() __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = reduce(update_value, registerFile[update_addr]); - // std::min(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -206,7 +205,6 @@ WLEngine::processNextReduceEvent() // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = reduce(update_value, workListFile[addr].tempProp); - // std::min(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; From ab2362a81cfec8311e017d824c9d6208beec235d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 8 Sep 2022 10:20:48 -0700 Subject: [PATCH 159/287] Adding stats. --- configs/accl/sega.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 +++- src/accl/graph/sega/push_engine.cc | 7 ++++++- src/accl/graph/sega/push_engine.hh | 2 ++ src/accl/graph/sega/wl_engine.cc | 9 ++++++++- src/accl/graph/sega/wl_engine.hh | 1 + 7 files changed, 35 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ddeae34e4e..e8d76e7dad 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -159,7 +159,7 @@ def get_inputs(): image_path = None if input_args.synthetic: base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.path.abspath(os.environ.get("GRAPH_GEN")) + graph_gen = os.environ.get("GRAPH_GEN") graph_reader = os.environ.get("GRAPH_READER") graph_sorter = os.environ.get("GRAPH_SORTER") if graph_gen is None: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a80d629737..dbe5e56f2d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -210,7 +210,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. 
Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " @@ -241,7 +241,7 @@ CoalesceEngine::recvWLRead(Addr addr) "Rejecting request.\n", __func__); // TODO: Break out read rejections into more than one stat // based on the cause of the rejection - stats.readRejections++; + stats.mshrEntryShortage++; return false; } else { DPRINTF(CoalesceEngine, "%s: MSHR " @@ -399,7 +399,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CoalesceEngine, "%s: Out of targets for " "cacheBlocks[%d]. Rejecting request.\n", __func__, block_index); - stats.readRejections++; + stats.mshrTargetShortage++; return false; } DPRINTF(CoalesceEngine, "%s: There is room for another target " @@ -740,6 +740,8 @@ CoalesceEngine::processNextApplyEvent() } } } + stats.bitvectorLength.sample(needsPush.count()); + cacheBlocks[block_index].needsWB = true; cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; @@ -1055,12 +1057,16 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(readRejections, statistics::units::Count::get(), - "Number of cache rejections."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries.") + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector") { } @@ -1069,7 +1075,8 @@ 
CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(64); + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 7503d69b76..16c417fc60 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -158,10 +158,12 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar readRejections; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 625f836561..855d666989 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -300,7 +300,10 @@ PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates.") + "Number of sent updates."), + ADD_STAT(TEPS, statistics::units::Rate::get(), + "Traversed Edges Per Second.") { } @@ -308,6 +311,8 @@ void PushEngine::PushStats::regStats() { using namespace statistics; + + TEPS = numUpdates / simSeconds; } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a64a5b1f5b..a5677067b8 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,8 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + + statistics::Formula TEPS; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc 
b/src/accl/graph/sega/wl_engine.cc index 5465769cff..a39905037e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -144,6 +144,10 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; } } else { // TODO: Generalize this to reduce function rather than just min @@ -231,7 +235,10 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numReduce, statistics::units::Count::get(), "Number of memory blocks read for vertecies"), ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies") + "Number of memory blocks read for vertecies"), + ADD_STAT(registerShortage, statistics::units::Count::get(), + "Number of times updates were " + "stalled because of register shortage") { } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b03a3cdb87..2956e58666 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -76,6 +76,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; + statistics::Scalar registerShortage; }; WorkListStats stats; From 40b01f05558c798a20e60b26822d9ca8241b47eb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 11 Sep 2022 14:39:42 -0700 Subject: [PATCH 160/287] Separating graph generation from run script. 
--- configs/accl/graph-gen.py | 103 ++++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 96 +++-------------------------------- 2 files changed, 110 insertions(+), 89 deletions(-) create mode 100644 configs/accl/graph-gen.py diff --git a/configs/accl/graph-gen.py b/configs/accl/graph-gen.py new file mode 100644 index 0000000000..16985b3537 --- /dev/null +++ b/configs/accl/graph-gen.py @@ -0,0 +1,103 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") + argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.scale, args.deg, args.num_gpts + +if __name__ == "__main__": + scale, deg, num_gpts = get_inputs() + + base_dir = os.environ.get("GRAPH_DIR", default="/tmp") + graph_gen = os.environ.get("GRAPH_GEN") + graph_reader = os.environ.get("GRAPH_READER") + graph_sorter = os.environ.get("GRAPH_SORTER") + if graph_gen is None: + raise ValueError(f"No value for $GRAPH_GEN.") + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER") + + graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") + if not os.path.exists(graph_path): + print(f"{graph_path} does not exist already.") + os.mkdir(graph_path) + print(f"Created {graph_path}") + + if not "graph.txt" in os.listdir(graph_path): + print(f"graph.txt not found in {graph_path}") + for delete in os.scandir(graph_path): + os.remove(delete.path) + print(f"Deleted everything in {graph_path}") + subprocess.run([f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt"]) + print(f"Generated a graph with scale " + f"{scale} and deg {deg}") + subprocess.run(["python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt"]) + print(f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt") + subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) + print(f"Deleted {graph_path}/graph_unordered.txt") + + if not "binaries" in os.listdir(graph_path): + print(f"binaries directory not found in 
{graph_path}") + os.mkdir(f"{graph_path}/binaries") + print(f"Created {graph_path}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_path}/binaries") + os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") + print(f"Created {graph_path}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): + os.remove(delete.path) + print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e8d76e7dad..10f7ea2b48 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -25,9 +25,7 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
import m5 -import os import argparse -import subprocess from math import log from m5.objects import * @@ -49,7 +47,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=32, + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=32) self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, cache_size=cache_size, @@ -132,99 +130,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("vertex_cache_line_size", type=int) - argparser.add_argument("synthetic", type=bool) - argparser.add_argument("--scale", type=int) - argparser.add_argument("--deg", type=int) - argparser.add_argument("--graph", type=str) + argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) args = argparser.parse_args() - if args.synthetic: - if (args.scale is None) or (args.deg is None): - raise ValueError("If synthetic is true, you should specify the" - "scale of the graph by --scale [scale] and the average" - "degree of the graph by --deg [average degree].") - else: - if args.graph is None: - raise ValueError("If synthetic is false, you should specify the " - "path to graph binaries by --graph [path to graph].") - return args + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value if __name__ == "__m5_main__": - input_args = get_inputs() - - image_path = None - if input_args.synthetic: - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - 
raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{input_args.scale}_{input_args.deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{input_args.scale}", - f"{input_args.deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{input_args.scale} and deg {input_args.deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{input_args.num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{input_args.num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{input_args.num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(input_args.num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{input_args.num_gpts}") - for delete in os.scandir(f"{graph_path}/binaries/gpts_{input_args.num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{input_args.num_gpts}") - 
subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{input_args.num_gpts}", - f"{input_args.vertex_cache_line_size}", - f"{graph_path}/binaries/gpts_{input_args.num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{input_args.num_gpts}") - image_path = f"{graph_path}/binaries/gpts_{input_args.num_gpts}" - else: - image_path = input_args.graph - - system = SEGA(input_args.num_gpts, - input_args.cache_size, - image_path, - input_args.init_addr, - input_args.init_value) + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) root = Root(full_system = False, system = system) m5.instantiate() From 6124b008976c8797d0b330815f9b04579abf42ce Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 12 Sep 2022 15:25:11 -0700 Subject: [PATCH 161/287] Adding new stats. --- src/accl/graph/sega/coalesce_engine.cc | 13 ++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dbe5e56f2d..7646ba8862 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -834,9 +834,13 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) peerMemoryAtomSize); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } void @@ -1000,6 +1004,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; + } else { + stats.workSearchFails++; } if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( @@ -1061,6 +1067,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(workSearchFails, statistics::units::Count::get(), + "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. " + "Once for push and once to populate the cache."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 16c417fc60..355eaad07d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -160,6 +160,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; statistics::Formula hitRate; statistics::Histogram mshrEntryLength; From 655902315cc2a07658100ebbdc568cb59523ef85 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 13 Sep 2022 21:44:54 -0700 Subject: [PATCH 162/287] Fixing sconscript style. 
--- src/accl/graph/base/SConscript | 6 ++--- src/accl/graph/sega/SConscript | 44 +++++++++++++++++----------------- 2 files changed, 25 insertions(+), 25 deletions(-) diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 0e43d1aed8..8b741abfc8 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -25,8 +25,8 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -Import('*') +Import("*") -SimObject('BaseReduceEngine.py') +SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) -Source('base_reduce_engine.cc') +Source("base_reduce_engine.cc") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d48b46fba..f16d025ca2 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -25,30 +25,30 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-Import('*') +Import("*") -SimObject('BaseMemoryEngine.py') -SimObject('CenteralController.py') -SimObject('CoalesceEngine.py') -SimObject("MPU.py") -SimObject('PushEngine.py') -SimObject('WLEngine.py') +SimObject("BaseMemoryEngine.py", sim_objects=["BaseMemoryEngine"]) +SimObject("CenteralController.py", sim_objects=["CenteralController"]) +SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) +SimObject("MPU.py", sim_objects=["MPU"]) +SimObject("PushEngine.py", sim_objects=["PushEngine"]) +SimObject("WLEngine.py", sim_objects=["WLEngine"]) -Source('base_memory_engine.cc') -Source('centeral_controller.cc') -Source('coalesce_engine.cc') +Source("base_memory_engine.cc") +Source("centeral_controller.cc") +Source("coalesce_engine.cc") Source("mpu.cc") -Source('push_engine.cc') -Source('wl_engine.cc') +Source("push_engine.cc") +Source("wl_engine.cc") -DebugFlag('ApplyUpdates') -DebugFlag('BaseMemoryEngine') -DebugFlag('CenteralController') -DebugFlag('CacheBlockState') -DebugFlag('CoalesceEngine') -DebugFlag('PushEngine') -DebugFlag('SEGAStructureSize') -DebugFlag('WLEngine') +DebugFlag("ApplyUpdates") +DebugFlag("BaseMemoryEngine") +DebugFlag("CenteralController") +DebugFlag("CacheBlockState") +DebugFlag("CoalesceEngine") +DebugFlag("PushEngine") +DebugFlag("SEGAStructureSize") +DebugFlag("WLEngine") -CompoundFlag('MPU', ['CoalesceEngine', 'PushEngine', - 'WLEngine', 'BaseMemoryEngine']) \ No newline at end of file +CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", + "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file From 489e914deb132f3b81cd0b31ff0254226aa08db9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 15 Sep 2022 11:16:25 -0700 Subject: [PATCH 163/287] Adding stats for measuring push and pull rate. 
--- configs/accl/sega.py | 21 ++++++++----- src/accl/graph/sega/coalesce_engine.cc | 34 ++++++++++++++++++++- src/accl/graph/sega/coalesce_engine.hh | 41 ++++++++++++++++---------- 3 files changed, 72 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 10f7ea2b48..2a92ee1769 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False)) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -92,7 +92,8 @@ def __init__(self, cache_size, graph_path, first_addr, - first_value): + first_value + ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -103,16 +104,20 @@ def __init__(self, self.interconnect = NoncoherentXBar(frontend_latency=1, forward_latency=1, response_latency=1, - width=64) + width=64 + ) self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices") + image_file=f"{graph_path}/vertices" + ) + self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7646ba8862..5f1e849660 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -454,6 +454,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) needsPush[it + i] = 0; _workCount--; owner->recvVertexPush(vertex_addr, items[i]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - 
stats.lastResetTick; } } pendingVertexPullReads.erase(addr); @@ -990,6 +992,8 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) _workCount--; owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } if (bit_status == BitStatus::IN_MEMORY) { Addr addr = location; @@ -1037,6 +1041,8 @@ CoalesceEngine::recvVertexPull() bool should_schedule = (numPullsReceived == 0); numPullsReceived++; + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; if (should_schedule) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1052,7 +1058,7 @@ CoalesceEngine::recvVertexPull() CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), - + lastResetTick(0), ADD_STAT(numVertexReads, statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), @@ -1072,8 +1078,22 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. " "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. 
(Relative to reset_stats)"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), @@ -1091,6 +1111,18 @@ CoalesceEngine::CoalesceStats::regStats() hitRate = (readHits + readHitUnderMisses) / (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 355eaad07d..8190478a1b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -147,25 +147,36 @@ class CoalesceEngine : public BaseMemoryEngine struct CoalesceStats : public statistics::Group { - CoalesceStats(CoalesceEngine &coalesce); + CoalesceStats(CoalesceEngine &coalesce); - void regStats() override; + virtual void regStats() override; - CoalesceEngine &coalesce; + virtual void resetStats() override; - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - statistics::Scalar workSearchFails; - statistics::Scalar numDoubleMemReads; + CoalesceEngine &coalesce; - statistics::Formula hitRate; - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; + Tick lastResetTick; + + statistics::Scalar 
numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + statistics::Scalar workSearchFails; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; }; CoalesceStats stats; From b297c794e5c08daa6be9727b554687507594a034 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 16 Sep 2022 14:18:57 -0700 Subject: [PATCH 164/287] Added FinalAnswer debugFlag and answer printing. --- configs/accl/sega.py | 8 ++-- src/accl/graph/sega/CenteralController.py | 4 +- src/accl/graph/sega/SConscript | 2 +- src/accl/graph/sega/base_memory_engine.hh | 2 +- src/accl/graph/sega/centeral_controller.cc | 43 ++++++++++++++++++---- src/accl/graph/sega/centeral_controller.hh | 7 ++-- src/accl/graph/sega/coalesce_engine.cc | 36 ++++++++++++++---- src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 9 files changed, 82 insertions(+), 24 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2a92ee1769..7b37742cdb 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -107,9 +107,11 @@ def __init__(self, width=64 ) - self.ctrl = CenteralController(addr=first_addr, value=first_value, - image_file=f"{graph_path}/vertices" - ) + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6f6b12ea2c..9bee76511d 
100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -39,6 +39,6 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - addr = Param.Addr("The addr for the initial update") - value = Param.Int("The value for the initial update") + init_addr = Param.Addr("The addr for the initial update") + init_value = Param.Int("The value for the initial update") image_file = Param.String("Path to the global memory image.") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index f16d025ca2..5d411be9ac 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -41,11 +41,11 @@ Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") -DebugFlag("ApplyUpdates") DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") +DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") DebugFlag("WLEngine") diff --git a/src/accl/graph/sega/base_memory_engine.hh b/src/accl/graph/sega/base_memory_engine.hh index f336edcbf1..afe7fd0433 100644 --- a/src/accl/graph/sega/base_memory_engine.hh +++ b/src/accl/graph/sega/base_memory_engine.hh @@ -108,7 +108,7 @@ class BaseMemoryEngine : public ClockedObject AddrRangeList getAddrRanges() { return memPort.getAddrRanges(); } - void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + virtual void recvFunctional(PacketPtr pkt) = 0; virtual void init() override; }; diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 5ce7228abb..c6de1d8390 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,8 +28,6 @@ #include "accl/graph/sega/centeral_controller.hh" -#include - #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -44,8 +42,7 @@ 
CenteralController::CenteralController ClockedObject(params), system(params.system), reqPort(name() + ".req_port", this), - addr(params.addr), - value(params.value) + maxVertexAddr(0) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -66,9 +63,9 @@ CenteralController::getPort(const std::string &if_name, PortID idx) void CenteralController::initState() { - ClockedObject::initState(); + // ClockedObject::initState(); - const auto &file = params().image_file; + const auto& file = params().image_file; if (file == "") return; @@ -77,6 +74,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); + maxVertexAddr = image.maxAddr(); PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, system->cacheLineSize()); @@ -86,7 +84,10 @@ CenteralController::initState() void CenteralController::startup() { - PacketPtr first_update = createUpdatePacket(addr, value); + Addr initial_addr = params().init_addr; + uint32_t initial_value = params().init_value; + PacketPtr first_update = + createUpdatePacket(initial_addr, initial_value); if (!reqPort.blocked()) { reqPort.sendPacket(first_update); @@ -111,6 +112,21 @@ CenteralController::createUpdatePacket(Addr addr, T value) return pkt; } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC((Addr) 0); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + void CenteralController::ReqPort::sendPacket(PacketPtr pkt) { @@ -160,6 +176,19 @@ CenteralController::recvDoneSignal() } if (done) { + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + reqPort.sendFunctional(pkt); + + int 
num_items = system->cacheLineSize() / sizeof(WorkListItem); + WorkListItem items[num_items]; + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + + for (int i = 0; i < num_items; i++) { + DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", + __func__, addr, i, items[i].to_string()); + } + } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index c54c4c04ef..bd272cf30d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -67,12 +68,12 @@ class CenteralController : public ClockedObject System* system; ReqPort reqPort; - Addr addr; - uint32_t value; - + Addr maxVertexAddr; std::vector mpuVector; + template PacketPtr createUpdatePacket(Addr addr, T value); + PacketPtr createReadPacket(Addr addr, unsigned int size); void functionalAccess(PacketPtr pkt); public: diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 5f1e849660..59d9720148 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -32,7 +32,6 @@ #include "accl/graph/sega/mpu.hh" #include "base/intmath.hh" -#include "debug/ApplyUpdates.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" #include "debug/SEGAStructureSize.hh" @@ -75,12 +74,38 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + 
assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. + // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + memPort.sendFunctional(pkt); + } +} + bool CoalesceEngine::done() { - bool push_none = needsPush.none(); - DPRINTF(CoalesceEngine, "%s: needsPush.none: %s.\n", - __func__, push_none ? "true" : "false"); return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -723,9 +748,6 @@ CoalesceEngine::processNextApplyEvent() if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; - DPRINTF(ApplyUpdates, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, cacheBlocks[block_index].addr, index, - cacheBlocks[block_index].items[index].to_string()); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8190478a1b..bb6fd9d1ea 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -190,6 +190,8 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt); + bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index a5677067b8..b317992b2d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -142,6 +142,8 @@ class 
PushEngine : public BaseMemoryEngine PushEngine(const Params& params); void registerMPU(MPU* mpu); + virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } + void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); From 16216bc2bf3dee723fa35eccd478412e47bfe738 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 18 Sep 2022 17:17:24 -0700 Subject: [PATCH 165/287] Adding stats to measure vertexReadLatency. --- src/accl/graph/sega/coalesce_engine.cc | 5 ++++- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 14 ++++++++++++-- src/accl/graph/sega/wl_engine.hh | 5 +++-- 4 files changed, 20 insertions(+), 5 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 59d9720148..d4102a8bca 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -810,6 +810,7 @@ void CoalesceEngine::processNextMemoryEvent() { if (memPort.blocked()) { + stats.numMemoryBlocks++; nextMemoryEvent.sleep(); return; } @@ -1097,6 +1098,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by target shortage."), ADD_STAT(workSearchFails, statistics::units::Count::get(), "Number of times coalesce engine fails to find work to push."), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), "Number of times a memory block has been read twice. 
" "Once for push and once to populate the cache."), @@ -1147,4 +1150,4 @@ CoalesceEngine::CoalesceStats::resetStats() lastResetTick = curTick(); } -} +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index bb6fd9d1ea..967d83a531 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; statistics::Scalar workSearchFails; + statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a39905037e..b16d827dbe 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -143,6 +143,7 @@ WLEngine::processNextReadEvent() "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); owner->checkRetryReq(); + vertexReadTime[update_addr] = curTick(); } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -189,6 +190,11 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. 
workListFile.size = %d.\n", __func__, addr, wl.to_string(), workListFile.size()); + + stats.vertexReadLatency.sample( + (curTick() - vertexReadTime[addr]) / getClockFrequency()); + vertexReadTime.erase(addr); + assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); @@ -238,7 +244,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of memory blocks read for vertecies"), ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " - "stalled because of register shortage") + "stalled because of register shortage"), + ADD_STAT(vertexReadLatency, statistics::units::Second::get(), + "Histogram of the latency of reading a vertex.") { } @@ -246,6 +254,8 @@ void WLEngine::WorkListStats::regStats() { using namespace statistics; -} + vertexReadLatency.init(64); } + +} // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2956e58666..0c6361825e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -47,13 +47,12 @@ class WLEngine : public BaseReduceEngine private: MPU* owner; - - int updateQueueSize; std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; + std::unordered_map vertexReadTime; std::unordered_map workListFile; @@ -77,6 +76,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + + statistics::Histogram vertexReadLatency; }; WorkListStats stats; From 3e6216c8976155517cb9edb2874ca7c890b56255 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 19 Sep 2022 11:56:05 -0700 Subject: [PATCH 166/287] Adding a config script with simple memory --- configs/accl/sega-simple.py | 177 ++++++++++++++++++++++++++++++++++++ configs/accl/sega.py | 48 ++++++---- 2 files changed, 206 insertions(+), 19 deletions(-) create mode 100644 configs/accl/sega-simple.py diff --git 
a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py new file mode 100644 index 0000000000..ae537e76ca --- /dev/null +++ b/configs/accl/sega-simple.py @@ -0,0 +1,177 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="75ns", + latency_var="0ns", + bandwidth="19.2GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__( + self, + num_mpus, + cache_size, + graph_path, + first_addr, + first_value + ): + super(SEGA, self).__init__() 
+ self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '1GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.interconnect = NoncoherentXBar( + frontend_latency=1, + forward_latency=1, + response_latency=1, + width=64 + ) + + self.ctrl = CenteralController( + init_addr=first_addr, + init_value=first_value, + image_file=f"{graph_path}/vertices" + ) + + self.ctrl.req_port = self.interconnect.cpu_side_ports + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("8GiB", cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setReqPort(self.interconnect.cpu_side_ports) + gpt.setRespPort(self.interconnect.mem_side_ports) + gpts.append(gpt) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + root = Root(full_system = False, system = system) + + m5.instantiate() + + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7b37742cdb..8c30d10dec 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,29 +47,39 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): 
def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, - register_file_size=32) - self.coalesce_engine = CoalesceEngine(attached_memory_atom_size=32, + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=32, num_tgts_per_mshr=32, - max_resp_per_cycle=4) - self.push_engine = PushEngine(push_req_queue_size=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64) + resp_queue_size=64 + ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False)) + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port - self.mpu = MPU(wl_engine=self.wl_engine, + self.mpu = MPU( + wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine) + push_engine=self.push_engine + ) def getRespPort(self): return self.mpu.in_port @@ -87,7 +97,8 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__(self, + def __init__( + self, num_mpus, cache_size, graph_path, @@ -101,25 +112,24 @@ def __init__(self, self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar(frontend_latency=1, + self.interconnect = NoncoherentXBar( + frontend_latency=1, forward_latency=1, response_latency=1, width=64 ) self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, + addr=first_addr, value=first_value, image_file=f"{graph_path}/vertices" ) - self.ctrl.req_port = self.interconnect.cpu_side_ports vertex_ranges = interleave_addresses( - 
AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): From e1d8a934fdbb80520c46e18a136a271ac676d255 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 19 Sep 2022 20:27:40 -0700 Subject: [PATCH 167/287] Adding stats to count the result of bitvector search. --- src/accl/graph/sega/coalesce_engine.cc | 12 +++++++----- src/accl/graph/sega/coalesce_engine.hh | 3 ++- src/accl/graph/sega/push_engine.cc | 3 +++ src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 2 +- 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d4102a8bca..b870345d57 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1031,9 +1031,10 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; } numPullsReceived--; - } else { - stats.workSearchFails++; } + + stats.bitvectorSearchStatus[bit_status]++; + if (numPullsReceived > 0) { memoryFunctionQueue.emplace_back( [this] (int slice_base, Tick schedule_tick) { @@ -1096,8 +1097,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), - ADD_STAT(workSearchFails, statistics::units::Count::get(), - "Number of times coalesce engine fails to find work to push."), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1111,6 +1110,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. 
(Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rateblocked()) { + stats.numNetBlocks++; nextPushEvent.sleep(); return; } @@ -301,6 +302,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) push(_push), ADD_STAT(numUpdates, statistics::units::Count::get(), "Number of sent updates."), + ADD_STAT(numNetBlocks, statistics::units::Count::get(), + "Number of updates blocked by network."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index b317992b2d..801d8e567d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -127,6 +127,7 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; statistics::Scalar numUpdates; + statistics::Scalar numNetBlocks; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index b16d827dbe..c6e8fda523 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -192,7 +192,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) wl.to_string(), workListFile.size()); stats.vertexReadLatency.sample( - (curTick() - vertexReadTime[addr]) / getClockFrequency()); + ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); vertexReadTime.erase(addr); assert(!workListFile.empty()); From a0a0fbeaa85a09ee2545adfaedfc251de483b6fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 12:21:46 -0700 Subject: [PATCH 168/287] Adding a stat to count number of idle cycles. 
--- src/accl/graph/sega/push_engine.cc | 6 +++++- src/accl/graph/sega/push_engine.hh | 6 +++--- src/accl/graph/sega/wl_engine.hh | 1 + 3 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a56283cbf6..5029013acd 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -28,7 +28,6 @@ #include "accl/graph/sega/push_engine.hh" -#include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/mpu.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" @@ -40,6 +39,7 @@ namespace gem5 PushEngine::PushEngine(const Params& params): BaseMemoryEngine(params), _running(false), + lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), workload(params.workload), @@ -107,6 +107,7 @@ PushEngine::start() assert(!nextVertexPullEvent.scheduled()); _running = true; + stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); if (vertexSpace()) { @@ -123,6 +124,7 @@ PushEngine::processNextVertexPullEvent() if (!workLeft()) { _running = false; + lastIdleEntranceTick = curTick(); } if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -304,6 +306,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of sent updates."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), + ADD_STAT(numIdleCycles, statistics::units::Count::get(), + "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second.") diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 801d8e567d..1f139d061e 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -37,7 +37,6 @@ namespace gem5 { -class CoalesceEngine; class MPU; class PushEngine : public BaseMemoryEngine @@ -88,10 +87,10 @@ class PushEngine : public BaseMemoryEngine Addr offset; int numElements; }; + MPU* owner; bool _running; - int numElementsPerLine; - MPU* owner; + Tick lastIdleEntranceTick; int numPendingPulls; int edgePointerQueueSize; @@ -128,6 +127,7 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numUpdates; statistics::Scalar numNetBlocks; + statistics::Scalar numIdleCycles; statistics::Formula TEPS; }; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0c6361825e..3d527df3cf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -77,6 +77,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Histogram vertexReadLatency; }; From efcc6d2b35a03dcaa078f5c95d91ef6028c7805b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 22 Sep 2022 17:32:46 -0700 Subject: [PATCH 169/287] Adding stats to measure queueing latencies. 
--- configs/accl/sega.py | 3 ++- src/accl/graph/base/data_structs.hh | 6 ++++-- src/accl/graph/sega/coalesce_engine.cc | 17 +++++++++++++++-- src/accl/graph/sega/coalesce_engine.hh | 5 ++++- src/accl/graph/sega/push_engine.cc | 25 +++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 12 +++++++++--- src/accl/graph/sega/wl_engine.cc | 12 +++++++++--- src/accl/graph/sega/wl_engine.hh | 4 ++-- 8 files changed, 64 insertions(+), 20 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 8c30d10dec..a67551a5fd 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -120,7 +120,8 @@ def __init__( ) self.ctrl = CenteralController( - addr=first_addr, value=first_value, + init_addr=first_addr, + init_value=first_value, image_file=f"{graph_path}/vertices" ) self.ctrl.req_port = self.interconnect.cpu_side_ports diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 026a3cb7b2..a46aaf2de9 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,8 +96,10 @@ struct CompleteEdge { uint32_t weight; uint32_t value; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): - src(src), dst(dst), weight(weight), value(value) + uint64_t entrance; + + CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b870345d57..62cae01613 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -826,6 +826,8 @@ CoalesceEngine::processNextMemoryEvent() next_memory_function_tick) = memoryFunctionQueue.front(); next_memory_function(next_memory_function_input, next_memory_function_tick); memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) 
+ * 1e9 / getClockFrequency()); DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " "memoryFunctionQueue.size = %d.\n", __func__, memoryFunctionQueue.size()); @@ -929,6 +931,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); + stats.numInvalidMemFunctions++; } } @@ -1110,6 +1113,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), @@ -1123,7 +1128,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(mshrEntryLength, statistics::units::Count::get(), "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector.") + "Histogram of the length of the bitvector."), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") { } @@ -1134,7 +1141,11 @@ CoalesceEngine::CoalesceStats::regStats() mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); bitvectorLength.init(64); - bitvectorSearchStatus.init(4); + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); hitRate = (readHits + readHitUnderMisses) / (readHits + 
readHitUnderMisses + readMisses); @@ -1142,6 +1153,8 @@ CoalesceEngine::CoalesceStats::regStats() vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + memoryFunctionLatency.init(64); } void diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 2b7b17d196..262f75fbcf 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -47,7 +47,8 @@ enum BitStatus PENDING_READ, IN_CACHE, IN_MEMORY, - GARBAGE + GARBAGE, + NUM_STATUS }; class MPU; @@ -170,6 +171,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidMemFunctions; statistics::Vector bitvectorSearchStatus; @@ -179,6 +181,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram memoryFunctionLatency; }; CoalesceStats stats; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5029013acd..af1c904eda 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -142,8 +142,10 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back( + start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, + (uint32_t) wl.prop, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -182,6 +184,9 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current 
EdgeReadInfoGen is done.\n", __func__); + stats.edgePointerQueueLatency.sample( + (curTick() - curr_info.entrance()) * + 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); @@ -224,8 +229,8 @@ PushEngine::handleMemResp(PacketPtr pkt) Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back(push_info.src, edge_dst, - edge_weight, push_info.value); + edges.emplace_back( + push_info.src, edge_dst, edge_weight, push_info.value, curTick()); } edgeQueue.push_back(edges); onTheFlyMemReqs--; @@ -267,7 +272,8 @@ PushEngine::processNextPushEvent() "with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); - + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); edge_list.pop_front(); if (edge_list.empty()) { edgeQueue.pop_front(); @@ -310,7 +316,11 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of cycles PushEngine has been idle."), ADD_STAT(TEPS, statistics::units::Rate::get(), - "Traversed Edges Per Second.") + "Traversed Edges Per Second."), + ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of the edgeQueue.") { } @@ -320,6 +330,9 @@ PushEngine::PushStats::regStats() using namespace statistics; TEPS = numUpdates / simSeconds; + + edgePointerQueueLatency.init(64); + edgeQueueLatency.init(64); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1f139d061e..5d2277eb5a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -52,11 +52,12 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; + Tick 
_entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + size_t atom, Addr src, uint32_t value, Tick entrance): + _start(start), _end(end), _step(step), _atom(atom), + _src(src), _value(value), _entrance(entrance) {} std::tuple nextReadPacketInfo() @@ -80,6 +81,8 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } + + Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -130,6 +133,9 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numIdleCycles; statistics::Formula TEPS; + + statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgeQueueLatency; }; PushStats stats; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index c6e8fda523..5d4dd1723e 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -79,7 +79,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return false; } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE()); + updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", __func__, pkt->getAddr(), pkt->getLE(), @@ -105,7 +105,8 @@ WLEngine::processNextReadEvent() { Addr update_addr; uint32_t update_value; - std::tie(update_addr, update_value) = updateQueue.front(); + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. 
" "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -134,6 +135,7 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -162,6 +164,7 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); + stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -246,7 +249,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "Number of times updates were " "stalled because of register shortage"), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), - "Histogram of the latency of reading a vertex.") + "Histogram of the latency of reading a vertex (ns)."), + ADD_STAT(updateQueueLatency, statistics::units::Second::get(), + "Histogram of the latency of dequeuing an update (ns).") { } @@ -256,6 +261,7 @@ WLEngine::WorkListStats::regStats() using namespace statistics; vertexReadLatency.init(64); + updateQueueLatency.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 3d527df3cf..f888979be9 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -48,7 +48,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; int updateQueueSize; - std::deque> updateQueue; + std::deque> updateQueue; int registerFileSize; std::unordered_map registerFile; @@ -77,8 +77,8 @@ class WLEngine : public BaseReduceEngine statistics::Scalar 
registerFileCoalesce; statistics::Scalar registerShortage; - statistics::Histogram vertexReadLatency; + statistics::Histogram updateQueueLatency; }; WorkListStats stats; From baa1dcb8df2e4d09a05ed6b97fd1b36c24f92e74 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 10:52:35 -0700 Subject: [PATCH 170/287] Added pybindmethod to createInitialUpdate. merge added. --- configs/accl/sega-simple.py | 8 +- configs/accl/sega-single-simple.py | 151 ++++++++++++++++++++ configs/accl/sega-single.py | 155 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 8 +- src/accl/graph/sega/MPU.py | 1 + src/accl/graph/sega/base_memory_engine.cc | 20 +-- src/accl/graph/sega/centeral_controller.cc | 131 +++++------------ src/accl/graph/sega/centeral_controller.hh | 39 ++---- src/accl/graph/sega/coalesce_engine.cc | 27 ++++ src/base/addr_range.hh | 31 +++++ 10 files changed, 430 insertions(+), 141 deletions(-) create mode 100644 configs/accl/sega-single-simple.py create mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index ae537e76ca..e0a4fcc89e 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -65,15 +65,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) self.vertex_mem_ctrl = SimpleMemory( - latency="75ns", + latency="0ns", latency_var="0ns", - bandwidth="19.2GB/s" + bandwidth="0GB/s" ) self.edge_mem_ctrl = SimpleMemory( - latency="75ns", + latency="30ns", latency_var="0ns", - bandwidth="19.2GB/s", + bandwidth="32GB/s", range=AddrRange(edge_memory_size), in_addr_map=False ) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py new file mode 100644 index 0000000000..a87e6c53bb --- /dev/null +++ b/configs/accl/sega-single-simple.py @@ -0,0 +1,151 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="0GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GB/s", + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_vertex_image(self, vertex_image): + self.vertex_mem_ctrl.image_file = vertex_image + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, 
graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_range(AddrRange("4GiB")) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py new file mode 100644 index 0000000000..d9fe11a781 --- /dev/null +++ b/configs/accl/sega-single.py @@ -0,0 +1,155 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, + register_file_size=32 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=32, + num_tgts_per_mshr=32, + max_resp_per_cycle=4 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64 + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GiB/s" + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.mpu.in_port + def setRespPort(self, port): + self.mpu.in_port = port + + def getReqPort(self): + return self.mpu.out_port + def setReqPort(self, port): + self.mpu.out_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + 
self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + plain_vertex_range = AddrRange("4GiB") + self._vertex_ranges = interleave_addresses( + plain_vertex_range, + 1, + 32 + ) + + gpts = [GPT("8GiB", cache_size)] + gpts[0].set_vertex_ranges(self._vertex_ranges[0]) + gpts[0].set_edge_image(f"{graph_path}/edgelist_0") + gpts[0].setReqPort(gpts[0].getRespPort()) + self.gpts = gpts + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.cache_size, args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9bee76511d..0721ff977c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.ClockedObject import ClockedObject class CenteralController(ClockedObject): @@ -35,10 +36,9 @@ class CenteralController(ClockedObject): cxx_class = 'gem5::CenteralController' system = Param.System(Parent.any, "System 
this Engine is a part of") - req_port = RequestPort("Port to send updates to the outside") + + image_file = Param.String("Path to the vertex image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") - init_addr = Param.Addr("The addr for the initial update") - init_value = Param.Int("The value for the initial update") - image_file = Param.String("Path to the global memory image.") + cxx_exports = [PyBindMethod("createInitialBFSUpdate")] diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 2d65be2949..d80142b21e 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -45,3 +45,4 @@ class MPU(SimObject): "each instance of MPU object.") push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index 9bd1941b23..d9864664b1 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -59,14 +59,18 @@ void BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - // BaseMemoryEngine only supports one memory. - assert(memory_ranges.size() == 1); - - peerMemoryRange = memory_ranges.front(); - DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is %s. " - "The range is %s interleaved.\n", __func__, - peerMemoryRange.to_string(), - peerMemoryRange.interleaved() ? "" : "not"); + + if (memory_ranges.size() == 2) { + peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); + } else if (memory_ranges.size() == 1) { + peerMemoryRange = memory_ranges.front(); + } else { + panic("Received an unacceptable number of ranges from memory."); + } + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " + "%s. The range is %s interleaved.\n", __func__, + peerMemoryRange.to_string(), + peerMemoryRange.interleaved() ? 
"" : "not"); } void diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6de1d8390..68b88e9e77 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -37,12 +37,9 @@ namespace gem5 { -CenteralController::CenteralController - (const CenteralControllerParams ¶ms): +CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system), - reqPort(name() + ".req_port", this), - maxVertexAddr(0) + system(params.system) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,33 +47,35 @@ CenteralController::CenteralController } } -Port& -CenteralController::getPort(const std::string &if_name, PortID idx) -{ - if (if_name == "req_port") { - return reqPort; - } else { - return SimObject::getPort(if_name, idx); - } -} - void CenteralController::initState() { - // ClockedObject::initState(); - + for (auto mpu: mpuVector) { + addrRangeListMap[mpu] = mpu->getAddrRanges(); + } const auto& file = params().image_file; if (file == "") return; - auto *object = loader::createObjectFile(file, true); + auto* object = loader::createObjectFile(file, true); fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); - PortProxy proxy([this](PacketPtr pkt) { functionalAccess(pkt); }, - system->cacheLineSize()); + Addr maxVertexAddr = image.maxAddr(); + + PortProxy proxy( + [this](PacketPtr pkt) { + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(pkt->getAddr())) { + mpu->recvFunctional(pkt); + break; + } + } + } + }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); } @@ -84,21 +83,24 @@ CenteralController::initState() void CenteralController::startup() { - Addr 
initial_addr = params().init_addr; - uint32_t initial_value = params().init_value; - PacketPtr first_update = - createUpdatePacket(initial_addr, initial_value); - - if (!reqPort.blocked()) { - reqPort.sendPacket(first_update); + while(!initialUpdates.empty()) { + PacketPtr front = initialUpdates.front(); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + for (auto range: range_list) { + if (range.contains(front->getAddr())) { + mpu->handleIncomingUpdate(front); + } + } + } + initialUpdates.pop_front(); } } template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { - RequestPtr req = std::make_shared( - addr, sizeof(T), addr, value); + RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); // Dummy PC to have PC-based prefetchers latch on; get entropy into higher // bits req->setPC(((Addr) value) << 2); @@ -106,65 +108,17 @@ CenteralController::createUpdatePacket(Addr addr, T value) PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} -PacketPtr -CenteralController::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC((Addr) 0); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); + pkt->setLE(value); return pkt; } void -CenteralController::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(_blocked, "Should never try to send if blocked MemSide!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - _blocked = true; - } -} - -bool -CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -CenteralController::ReqPort::recvReqRetry() -{ - panic_if(!(_blocked && blockedPacket), "Received retry without a blockedPacket"); - - _blocked = false; - sendPacket(blockedPacket); - - if (!_blocked) { - blockedPacket = nullptr; - } -} - -void -CenteralController::functionalAccess(PacketPtr pkt) +CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) { - DPRINTF(CenteralController, - "%s: Functional access for pkt->addr: %lu, pkt->size: %lu.\n", - __func__, pkt->getAddr(), pkt->getSize()); - reqPort.sendFunctional(pkt); + PacketPtr update = createUpdatePacket(init_addr, init_value); + initialUpdates.push_back(update); } void @@ -176,19 +130,6 @@ CenteralController::recvDoneSignal() } if (done) { - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); - reqPort.sendFunctional(pkt); - - int num_items = system->cacheLineSize() / sizeof(WorkListItem); - WorkListItem items[num_items]; - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); - - for (int i = 0; i < num_items; i++) { - DPRINTF(FinalAnswer, "%s: WorkListItem[%lu][%d]: %s.\n", - __func__, addr, i, items[i].to_string()); - } - } exitSimLoopNow("no update left to process."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index bd272cf30d..4a4e9c7cb1 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/mpu.hh" +#include "base/addr_range.hh" #include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" @@ -44,46 +45,24 @@ namespace gem5 
class CenteralController : public ClockedObject { private: - class ReqPort : public RequestPort - { - private: - CenteralController* owner; - bool _blocked; - PacketPtr blockedPacket; - - public: - ReqPort(const std::string& name, CenteralController* owner) : - RequestPort(name, owner), owner(owner), - _blocked(false), blockedPacket(nullptr) - {} - - void sendPacket(PacketPtr pkt); - bool blocked() { return _blocked; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; - ReqPort reqPort; Addr maxVertexAddr; + std::deque initialUpdates; + std::vector mpuVector; + std::unordered_map addrRangeListMap; - template PacketPtr - createUpdatePacket(Addr addr, T value); - PacketPtr createReadPacket(Addr addr, unsigned int size); - void functionalAccess(PacketPtr pkt); + template PacketPtr createUpdatePacket(Addr addr, T value); public: PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - Port& getPort(const std::string &if_name, - PortID idx=InvalidPortID) override; - virtual void initState(); - virtual void startup(); + virtual void initState() override; + virtual void startup() override; + + void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 62cae01613..ac62254fd6 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,6 +127,15 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -136,6 +145,15 @@ int 
CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); + // bool found = false; + // Addr trimmed_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(addr)) { + // trimmed_addr = range.removeIntlvBits(addr); + // found = true; + // } + // } + // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -147,7 +165,16 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); + // Addr upgraded_addr; + // for (auto range: peerMemoryRanges) { + // if (range.contains(trimmed_addr)) { + // upgraded_addr = range.addIntlvBits(trimmed_addr); + // found = true; + // } + // } + // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 07bd255d26..a4bf581224 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -48,6 +48,7 @@ #include "base/bitfield.hh" #include "base/cprintf.hh" +#include "base/intmath.hh" #include "base/logging.hh" #include "base/types.hh" @@ -732,6 +733,36 @@ class AddrRange { return !(*this == r); } + + friend AddrRange + merge(const AddrRange& left, const AddrRange& right) + { + assert(left.interleaved()); + assert(right.interleaved()); + assert(left.mergesWith(right)); + + int bits_org = left.masks.size(); + int bits_new = bits_org - 1; + + int left_match = left.intlvMatch; + int right_match = right.intlvMatch; + assert(std::abs(left_match - right_match) == (1 << bits_new)); + + Addr last_mask = left.masks[left.masks.size() - 1]; + int xor_high_bit_org = 0; + int xor_high_bit_new = 0; + if (!isPowerOf2(last_mask)) { + xor_high_bit_org = ceilLog2(last_mask); + xor_high_bit_new = xor_high_bit_org - 2; + } + int 
intlv_high_bit_org = + ceilLog2(last_mask ^ (1 << xor_high_bit_org)); + int intlv_high_bit_new = intlv_high_bit_org - 2; + + int match = std::min(left_match, right_match); + return AddrRange(left._start, left._end, intlv_high_bit_new, + xor_high_bit_new, bits_new, match); + } }; static inline AddrRangeList From a0461dea5bdcbf67dd89752790902f5e68e070fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 12:07:38 -0700 Subject: [PATCH 171/287] Adding stat to measure response latency. --- configs/accl/sega-simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 24 +++++++++++++++++++----- src/accl/graph/sega/coalesce_engine.hh | 2 ++ 3 files changed, 22 insertions(+), 6 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index e0a4fcc89e..fffc273ee1 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -113,7 +113,7 @@ def __init__( ): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '4GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index ac62254fd6..43d352da30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -274,6 +274,7 @@ CoalesceEngine::recvWLRead(Addr addr) "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; return true; } else { // miss @@ -618,9 +619,16 @@ CoalesceEngine::processNextResponseEvent() DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" "responseQueue.size = %d.\n", __func__, responseQueue.size()); - if ((num_responses_sent >= maxRespPerCycle) || - (responseQueue.empty())) { - break; + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; } } @@ -1127,6 +1135,9 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache rejections caused by entry shortage."), ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), @@ -1156,6 +1167,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), "Histogram of the latency of processing a memory function.") { @@ -1166,8 +1179,6 @@ CoalesceEngine::CoalesceStats::regStats() { using namespace statistics; - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); bitvectorSearchStatus.init(NUM_STATUS); bitvectorSearchStatus.subname(0, "PENDING_READ"); bitvectorSearchStatus.subname(1, "IN_CACHE"); @@ -1181,6 +1192,9 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 262f75fbcf..705285ba23 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -165,6 +165,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; statistics::Scalar mshrTargetShortage; + statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; @@ -181,6 +182,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From fbbd888e40e6e23b61331aee037a1ebc1a71e695 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 26 Sep 2022 17:01:13 -0700 Subject: [PATCH 172/287] Adding stats to count model inaccuracies. 
--- src/accl/graph/sega/coalesce_engine.cc | 9 +++++++-- src/accl/graph/sega/coalesce_engine.hh | 3 ++- 2 files changed, 9 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 43d352da30..0a4a041176 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -828,6 +828,8 @@ CoalesceEngine::processNextApplyEvent() } DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; } applyQueue.pop_front(); @@ -966,7 +968,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidMemFunctions++; + stats.numInvalidWriteBacks++; } } @@ -1151,7 +1153,10 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidMemFunctions, statistics::units::Count::get(), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), "Distribution for the location of vertex searches."), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 705285ba23..b1f5b1fea1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -172,7 +172,8 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidMemFunctions; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; statistics::Vector bitvectorSearchStatus; From 411bfa11be14dda13cc38351c2efeab4737503da Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 15:11:01 -0700 Subject: [PATCH 173/287] style fix. 
--- src/accl/graph/sega/push_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index af1c904eda..6ff1f77c45 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -335,4 +335,4 @@ PushEngine::PushStats::regStats() edgeQueueLatency.init(64); } -} +} // namespace gem5 From bf9bed1ca66b949bba7d03001f34fb6ed30c97b2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 12:37:13 -0700 Subject: [PATCH 174/287] Adding multiple queues and ports in pushEngine --- src/accl/graph/base/data_structs.hh | 24 +++++++- src/accl/graph/sega/MPU.py | 8 ++- src/accl/graph/sega/mpu.cc | 90 ++++++++++++++++++++++++++++- src/accl/graph/sega/mpu.hh | 15 ++++- src/accl/graph/sega/push_engine.cc | 12 +++- src/accl/graph/sega/push_engine.hh | 2 +- 6 files changed, 137 insertions(+), 14 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index a46aaf2de9..d3db3edda5 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -90,7 +90,7 @@ struct __attribute__ ((packed)) Edge static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); -struct CompleteEdge { +struct MetaEdge { uint64_t src; uint64_t dst; uint32_t weight; @@ -98,17 +98,35 @@ struct CompleteEdge { uint64_t entrance; - CompleteEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): + MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): src(src), dst(dst), weight(weight), value(value), entrance(entrance) {} std::string to_string() { - return csprintf("CompleteEdge{src: %lu, dst:%lu, weight: %u}", + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", src, dst, weight); } }; +struct Update { + uint64_t src; + uint64_t dst; + uint32_t value; + + Update(): src(0), dst(0), value(0) + {} + Update(uint64_t 
src, uint64_t dst, uint32_t value): + src(src), dst(dst), value(value) + {} + + std::string to_string() + { + return csprintf("Update{src: %lu, dst:%lu, value: %u}", + src, dst, value); + } +}; + template class UniqueFIFO { diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index d80142b21e..1ea6a868a9 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,9 +27,9 @@ from m5.params import * from m5.proxy import * -from m5.SimObject import SimObject +from m5.objects.ClockedObject import ClockedObject -class MPU(SimObject): +class MPU(ClockedObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" @@ -39,6 +39,8 @@ class MPU(SimObject): in_port = ResponsePort("Port to receive updates from outside") out_port = RequestPort("Port to send updates to the outside") + out_ports = VectorRequestPort("Ports to remote MPUs ") + wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -46,3 +48,5 @@ class MPU(SimObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") + update_queue_size = Param.Int(16, "Maximum number of entries " + "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 63aa474542..8897e5a959 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,23 +29,32 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "mem/packet_access.hh" #include "sim/sim_exit.hh" namespace gem5 { MPU::MPU(const Params& params): - SimObject(params), + ClockedObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this) + outPort(name() + ".outPort", this), + updateQueueSize(params.update_queue_size), 
+ nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); + + + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + + outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + } } Port& @@ -55,8 +64,10 @@ MPU::getPort(const std::string& if_name, PortID idx) return inPort; } else if (if_name == "out_port") { return outPort; + } else if (if_name == "outPorts") { + return outports[idx]; } else { - return SimObject::getPort(if_name, idx); + return ClockedObject::getPort(if_name, idx); } } @@ -166,6 +177,79 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +bool +MPU::enqueueUpdate(Update update) +{ + // Creating the packet + Addr dst_addr = update.dst; + bool found_locally = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + + for (int i = 0; i < outports.size(); i++) { + AddrRangeList addrList = outports[i].getAddrRanges(); + for (auto range : addrList) { + if (range.contains(dst_addr)) { + if (updateQueues[i].size() < updateQueueSize) { + updateQueues[i].emplace_back(update, curTick()); + return true; + } else { + return false; + } + } + } + } + + panic("The update created does not match to any outport."); +} + +template PacketPtr +MPU::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +MPU::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[i].front(); + if 
(outports[i].blocked()) { + continue; + } + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outports[i].sendPacket(pkt); + updateQueues[i].pop_front(); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index edf0350caf..d7042540f0 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -36,7 +36,7 @@ #include "base/addr_range.hh" #include "mem/packet.hh" #include "mem/port.hh" -#include "sim/sim_object.hh" +#include "sim/clocked_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -45,7 +45,7 @@ namespace gem5 class CenteralController; -class MPU : public SimObject +class MPU : public ClockedObject { private: class RespPort : public ResponsePort @@ -99,6 +99,16 @@ class MPU : public SimObject AddrRangeList localAddrRange; + uint32_t updateQueueSize; + + std::vector outports; + std::vector>> updateQueues; + + template PacketPtr createUpdatePacket(Addr addr, T value); + + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + public: PARAMS(MPU); MPU(const Params& params); @@ -115,6 +125,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 6ff1f77c45..4546ceee47 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -224,7 +224,7 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = 
reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; @@ -255,8 +255,8 @@ PushEngine::processNextPushEvent() return; } - std::deque& edge_list = edgeQueue.front(); - CompleteEdge curr_edge = edge_list.front(); + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, curr_edge.to_string()); @@ -267,6 +267,12 @@ PushEngine::processNextPushEvent() curr_edge.dst, update_value); owner->sendPacket(update); + + Update update_2(curr_edge.src, curr_edge.dst, update_value); + (!owner->enqueueUpdate(update_2)) { + // edge_list.pop_front(); + // edge_list.push_back(curr_edge); + } stats.numUpdates++; DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " "with value: %d.\n", __func__, curr_edge.src, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 5d2277eb5a..d6763e3ab7 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,7 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; - std::deque> edgeQueue; + std::deque> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From 32c75813f3bff0af05a960ad8b40d2f731a9296d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 28 Sep 2022 13:20:32 -0700 Subject: [PATCH 175/287] Changing propagate function --- src/accl/graph/sega/PushEngine.py | 7 ++- src/accl/graph/sega/push_engine.cc | 80 ++++++++++++------------------ src/accl/graph/sega/push_engine.hh | 5 +- 3 files changed, 41 insertions(+), 51 deletions(-) diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index ad9ddfefcf..7dba86aff2 100644 --- 
a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -40,6 +40,9 @@ class PushEngine(BaseMemoryEngine): # significantly bigger than push_req_queue_size resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " - "edges read from memory") + "edges read from memory.") + + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " + "done per cycle.") - workload = Param.String("BFS", "Name of the workload") + workload = Param.String("BFS", "Name of the workload.") diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4546ceee47..c82a4c88be 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -42,10 +42,11 @@ PushEngine::PushEngine(const Params& params): lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), - nextPushEvent([this] { processNextPushEvent(); }, name()), + nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), stats(*this) {} @@ -55,16 +56,6 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } -void -PushEngine::recvReqRetry() -{ - DPRINTF(PushEngine, "%s: Received a req retry.\n", __func__); - if (nextPushEvent.pending()) { - nextPushEvent.wake(); - schedule(nextPushEvent, nextCycle()); - } -} - bool PushEngine::vertexSpace() { @@ -238,57 +229,52 @@ PushEngine::handleMemResp(PacketPtr pkt) delete pkt_data; delete pkt; - if ((!nextPushEvent.pending()) && - (!nextPushEvent.scheduled())) { - schedule(nextPushEvent, nextCycle()); + if (!nextPropagateEvent.scheduled()) { + schedule(nextPropagateEvent, nextCycle()); } return true; } // TODO: Add a parameter to allow for 
doing multiple pushes at the same time. void -PushEngine::processNextPushEvent() +PushEngine::processNextPropagateEvent() { - if (owner->blocked()) { - stats.numNetBlocks++; - nextPushEvent.sleep(); - return; - } + int num_propagates = 0; + while(true) { + std::deque& edge_list = edgeQueue.front(); + MetaEdge curr_edge = edge_list.front(); - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + DPRINTF(PushEngine, "%s: The edge to process is %s.\n", + __func__, curr_edge.to_string()); - DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); + uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - // TODO: Implement propagate function here - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); - PacketPtr update = createUpdatePacket( - curr_edge.dst, update_value); - - owner->sendPacket(update); - - Update update_2(curr_edge.src, curr_edge.dst, update_value); - (!owner->enqueueUpdate(update_2)) { - // edge_list.pop_front(); - // edge_list.push_back(curr_edge); - } - stats.numUpdates++; - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to addr: %lu " - "with value: %d.\n", __func__, curr_edge.src, + Update update(curr_edge.src, curr_edge.dst, update_value); + edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { + DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " + "addr: %lu with value: %d.\n", __func__, curr_edge.src, curr_edge.dst, update_value); + stats.numUpdates++; + stats.edgeQueueLatency.sample( + (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + } else { + edge_list.push_back(curr_edge); + } - stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); - edge_list.pop_front(); - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; + } + + if (edge_list.empty()) { + edgeQueue.pop_front(); + } } - 
assert(!nextPushEvent.pending()); - assert(!nextPushEvent.scheduled()); + assert(!nextPropagateEvent.scheduled()); if (!edgeQueue.empty()) { - schedule(nextPushEvent, nextCycle()); + schedule(nextPropagateEvent, nextCycle()); } } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index d6763e3ab7..f3304a8e2a 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -102,6 +102,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; + int maxPropagatesPerCycle; std::deque> edgeQueue; std::string workload; @@ -117,8 +118,8 @@ class PushEngine : public BaseMemoryEngine MemoryEvent nextMemoryReadEvent; void processNextMemoryReadEvent(); - MemoryEvent nextPushEvent; - void processNextPushEvent(); + EventFunctionWrapper nextPropagateEvent; + void processNextPropagateEvent(); struct PushStats : public statistics::Group { From 666ab3de782318df4c94fa1baa52c94fd11b6c13 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 29 Sep 2022 14:59:25 -0700 Subject: [PATCH 176/287] Pushing on Marjan's behalf, refactored out_port to vector-port. 
--- configs/accl/sega-single-simple.py | 6 +- configs/accl/sega-single.py | 4 +- src/accl/graph/base/data_structs.hh | 8 +-- src/accl/graph/sega/MPU.py | 3 +- src/accl/graph/sega/mpu.cc | 85 +++++++++++++++-------------- src/accl/graph/sega/mpu.hh | 20 ++++--- src/accl/graph/sega/push_engine.cc | 64 +++++++++------------- src/accl/graph/sega/push_engine.hh | 3 +- 8 files changed, 94 insertions(+), 99 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index a87e6c53bb..92c1c9cbcb 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -92,10 +92,10 @@ def getRespPort(self): def setRespPort(self, port): self.mpu.in_port = port - def getReqPort(self): - return self.mpu.out_port def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port + def getReqPort(self): + return self.mpu.out_ports def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py index d9fe11a781..e4f7942f42 100644 --- a/configs/accl/sega-single.py +++ b/configs/accl/sega-single.py @@ -92,9 +92,9 @@ def setRespPort(self, port): self.mpu.in_port = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d3db3edda5..34c8eb98ce 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -96,10 +96,10 @@ struct MetaEdge { uint32_t weight; uint32_t value; - uint64_t entrance; - - MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value, uint64_t entrance): - src(src), dst(dst), weight(weight), value(value), entrance(entrance) + MetaEdge(): src(0), dst(0), weight(0), value(0) + {} + 
MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): + src(src), dst(dst), weight(weight), value(value) {} std::string to_string() diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 1ea6a868a9..aad2e060d1 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -37,9 +37,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") in_port = ResponsePort("Port to receive updates from outside") - out_port = RequestPort("Port to send updates to the outside") - out_ports = VectorRequestPort("Ports to remote MPUs ") + out_ports = VectorRequestPort("Outgoing ports to all MPUs") wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 8897e5a959..f86c7e02b7 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,7 +43,6 @@ MPU::MPU(const Params& params): coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), inPort(name() + ".inPort", this), - outPort(name() + ".outPort", this), updateQueueSize(params.update_queue_size), nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -52,8 +52,9 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - - outports.emplace_back(name() + ".out_ports" + std::to_string(i), this); + outPorts.emplace_back( + name() + ".outPorts" + std::to_string(i), this, i); + updateQueues.emplace_back(); } } @@ -62,10 +63,8 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_port") { return inPort; - } else if (if_name == "out_port") { - return outPort; - } else if (if_name == "outPorts") { - return outports[idx]; + } else if (if_name == 
"out_ports") { + return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -76,6 +75,9 @@ MPU::init() { localAddrRange = getAddrRanges(); inPort.sendRangeChange(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = getAddrRanges(); + } } void @@ -137,8 +139,6 @@ MPU::ReqPort::sendPacket(PacketPtr pkt) if (!sendTimingReq(pkt)) { blockedPacket = pkt; - } else { - owner->recvReqRetry(); } } @@ -157,6 +157,17 @@ MPU::ReqPort::recvReqRetry() PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +MPU::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } } bool @@ -180,28 +191,34 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) bool MPU::enqueueUpdate(Update update) { - // Creating the packet Addr dst_addr = update.dst; bool found_locally = false; + bool accepted = false; for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - - for (int i = 0; i < outports.size(); i++) { - AddrRangeList addrList = outports[i].getAddrRanges(); - for (auto range : addrList) { + DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", + __func__, outPorts.size(), updateQueues[0].size(), dst_addr); + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { if (range.contains(dst_addr)) { if (updateQueues[i].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d received an update.\n", + __func__, i); updateQueues[i].emplace_back(update, curTick()); - return true; - } else { - return false; + accepted = true; + break; } } } } - panic("The update created does not match to any outport."); + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; } template PacketPtr @@ -228,14 +245,19 @@ MPU::processNextUpdatePushEvent() 
int next_time_send = 0; for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); - if (outports[i].blocked()) { - continue; - } PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outports[i].sendPacket(pkt); + outPorts[i].sendPacket(pkt); + DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); updateQueues[i].pop_front(); if (updateQueues[i].size() > 0) { next_time_send += 1; @@ -256,25 +278,6 @@ MPU::recvVertexPush(Addr addr, WorkListItem wl) pushEngine->recvVertexPush(addr, wl); } -void -MPU::sendPacket(PacketPtr pkt) -{ - bool found_locally = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(pkt->getAddr()); - } - - if (found_locally) { - // TODO: count number of local updates - - } else { - // TOOD: count number of remote updates - - } - - outPort.sendPacket(pkt); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index d7042540f0..1a642e7873 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_MPU_HH__ #define __ACCL_GRAPH_SEGA_MPU_HH__ +#include +#include + #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" #include "accl/graph/sega/push_engine.hh" @@ -74,13 +77,16 @@ class MPU : public ClockedObject private: MPU* owner; PacketPtr blockedPacket; + PortID _id; public: - ReqPort(const std::string& name, MPU* owner) : - RequestPort(name, owner), owner(owner), blockedPacket(nullptr) + ReqPort(const std::string& name, MPU* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } + PortID id() 
{ return _id; } protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -95,15 +101,17 @@ class MPU : public ClockedObject PushEngine* pushEngine; RespPort inPort; - ReqPort outPort; AddrRangeList localAddrRange; uint32_t updateQueueSize; - std::vector outports; + std::unordered_map portAddrMap; + + std::vector outPorts; std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -133,9 +141,7 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - bool blocked() { return outPort.blocked(); } - void sendPacket(PacketPtr pkt); - void recvReqRetry() { pushEngine->recvReqRetry(); } + void recvReqRetry(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c82a4c88be..d533f1ea79 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -215,15 +215,18 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque edges; + std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; - edges.emplace_back( - push_info.src, edge_dst, edge_weight, push_info.value, curTick()); + MetaEdge meta_edge( + push_info.src, edge_dst, edge_weight, push_info.value); + edges.emplace_back(meta_edge, curTick()); } + assert(!edges.empty()); edgeQueue.push_back(edges); + onTheFlyMemReqs--; reqInfoMap.erase(pkt->req); delete pkt_data; @@ -235,40 +238,44 @@ PushEngine::handleMemResp(PacketPtr pkt) return true; } -// TODO: Add a parameter to allow for doing multiple pushes at the same time. 
void PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque& edge_list = edgeQueue.front(); - MetaEdge curr_edge = edge_list.front(); + std::deque>& edge_list = edgeQueue.front(); + MetaEdge meta_edge; + Tick entrance_tick; + std::tie(meta_edge, entrance_tick) = edge_list.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", - __func__, curr_edge.to_string()); - - uint32_t update_value = propagate(curr_edge.value, curr_edge.weight); + __func__, meta_edge.to_string()); - Update update(curr_edge.src, curr_edge.dst, update_value); + uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); + if (owner->enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sent a push update from addr: %lu to " - "addr: %lu with value: %d.\n", __func__, curr_edge.src, - curr_edge.dst, update_value); + DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( - (curTick() - curr_edge.entrance) * 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - edge_list.push_back(curr_edge); + edge_list.emplace_back(meta_edge, entrance_tick); } - num_propagates++; - if (num_propagates >= maxPropagatesPerCycle) { + if (edge_list.empty()) { + edgeQueue.pop_front(); + } + + if (edgeQueue.empty()) { break; } - if (edge_list.empty()) { - edgeQueue.pop_front(); + num_propagates++; + if (num_propagates >= maxPropagatesPerCycle) { + break; } } @@ -278,25 +285,6 @@ PushEngine::processNextPropagateEvent() } } -template PacketPtr -PushEngine::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared( - addr, sizeof(T), 0, _requestorId); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) _requestorId) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new 
Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index f3304a8e2a..fed6909733 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -103,11 +103,10 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque> edgeQueue; + std::deque>> edgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); - template PacketPtr createUpdatePacket(Addr addr, T value); bool vertexSpace(); bool workLeft(); From 194a5e4983af2498452daba971db27a2468148b6 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 30 Sep 2022 08:37:23 -0700 Subject: [PATCH 177/287] Attempting to add multi-inports to MPU --- configs/accl/sega-single-simple.py | 4 +-- configs/accl/sega.py | 49 ++++++++++++------------------ src/accl/graph/sega/MPU.py | 5 +-- src/accl/graph/sega/mpu.cc | 37 +++++++++++++--------- src/accl/graph/sega/mpu.hh | 13 ++++---- src/accl/graph/sega/wl_engine.cc | 2 +- 6 files changed, 55 insertions(+), 55 deletions(-) diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py index 92c1c9cbcb..eacb16d3d1 100644 --- a/configs/accl/sega-single-simple.py +++ b/configs/accl/sega-single-simple.py @@ -88,9 +88,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def setReqPort(self, port): self.mpu.out_ports = port diff --git a/configs/accl/sega.py b/configs/accl/sega.py index a67551a5fd..455d081145 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -78,18 +78,19 @@ def __init__(self, edge_memory_size: str, 
cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine + push_engine=self.push_engine, + update_queue_size=16 ) def getRespPort(self): - return self.mpu.in_port + return self.mpu.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.mpu.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.mpu.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range @@ -97,14 +98,7 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() self.clk_domain.clock = '1GHz' @@ -112,19 +106,7 @@ def __init__( self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( AddrRange(start=0, size="4GiB"), @@ -137,13 +119,18 @@ def __init__( gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def 
create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -160,10 +147,12 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} because {exit_event.getCause()}") + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aad2e060d1..aea76db86f 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -36,7 +36,8 @@ class MPU(ClockedObject): system = Param.System(Parent.any, "System this MPU is a part of") - in_port = ResponsePort("Port to receive updates from outside") + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote outside") out_ports = VectorRequestPort("Outgoing ports to all MPUs") @@ -47,5 +48,5 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int(16, "Maximum number of entries " + update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f86c7e02b7..4a80b22979 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -42,7 +42,6 @@ MPU::MPU(const Params& params): wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), pushEngine(params.push_engine), - inPort(name() + ".inPort", this), updateQueueSize(params.update_queue_size), 
nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) { @@ -53,16 +52,21 @@ MPU::MPU(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".outPorts" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); updateQueues.emplace_back(); } + + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } } Port& MPU::getPort(const std::string& if_name, PortID idx) { - if (if_name == "in_port") { - return inPort; + if (if_name == "in_ports") { + return inPorts[idx]; } else if (if_name == "out_ports") { return outPorts[idx]; } else { @@ -74,9 +78,11 @@ void MPU::init() { localAddrRange = getAddrRanges(); - inPort.sendRangeChange(); + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = getAddrRanges(); + portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); } } @@ -101,6 +107,14 @@ MPU::RespPort::checkRetryReq() } } +void +MPU::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool MPU::RespPort::recvTimingReq(PacketPtr pkt) { @@ -197,16 +211,13 @@ MPU::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } - DPRINTF(MPU, "%s: TESSSSTSSSS %d, %d, %llu.\n", - __func__, outPorts.size(), updateQueues[0].size(), dst_addr); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { - if (updateQueues[i].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", - __func__, i); - updateQueues[i].emplace_back(update, curTick()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(MPU, "%s: Queue %d 
received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); accepted = true; break; } @@ -268,8 +279,6 @@ MPU::processNextUpdatePushEvent() if (next_time_send > 0) { schedule(nextUpdatePushEvent, nextCycle()); } - - } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 1a642e7873..ff17eada0e 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -56,13 +56,16 @@ class MPU : public ClockedObject private: MPU* owner; bool needSendRetryReq; + PortID _id; public: - RespPort(const std::string& name, MPU* owner): - ResponsePort(name, owner), owner(owner), needSendRetryReq(false) + RespPort(const std::string& name, MPU* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; + PortID id() { return _id; } void checkRetryReq(); protected: @@ -100,18 +103,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - RespPort inPort; - AddrRangeList localAddrRange; uint32_t updateQueueSize; std::unordered_map portAddrMap; + std::vector inPorts; std::vector outPorts; std::vector>> updateQueues; - template PacketPtr createUpdatePacket(Addr addr, T value); EventFunctionWrapper nextUpdatePushEvent; @@ -129,7 +130,6 @@ class MPU : public ClockedObject void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } bool handleIncomingUpdate(PacketPtr pkt); - void checkRetryReq() { inPort.checkRetryReq(); } void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -142,6 +142,7 @@ class MPU : public ClockedObject void recvVertexPush(Addr addr, WorkListItem wl); void recvReqRetry(); + void checkRetryReq(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 
5d4dd1723e..0267bd46b6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -74,7 +74,7 @@ WLEngine::reduce(uint32_t update, uint32_t value) bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - assert(updateQueue.size() <= updateQueueSize); + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { return false; } From cddd042f6330e0da3e36dc2f278898944eb30d31 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 09:06:36 -0700 Subject: [PATCH 178/287] Moving reqPorts from MPU to PushEngine --- configs/accl/sega.py | 10 +- src/accl/graph/sega/MPU.py | 4 - src/accl/graph/sega/PushEngine.py | 7 +- src/accl/graph/sega/mpu.cc | 136 +------------------------ src/accl/graph/sega/mpu.hh | 36 ------- src/accl/graph/sega/push_engine.cc | 154 ++++++++++++++++++++++++++++- src/accl/graph/sega/push_engine.hh | 36 +++++++ 7 files changed, 200 insertions(+), 183 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 455d081145..21a041180f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,7 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -78,8 +79,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.mpu = MPU( wl_engine=self.wl_engine, coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine, - update_queue_size=16 + push_engine=self.push_engine ) def getRespPort(self): @@ -88,9 +88,9 @@ def setRespPort(self, port): self.mpu.in_ports = port def getReqPort(self): - return self.mpu.out_ports + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_ports = port + self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.dram.range = vertex_range diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index aea76db86f..3547cb8817 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -39,8 +39,6 @@ class MPU(ClockedObject): in_ports = VectorResponsePort("Incoming Ports to receive updates from " "remote outside") - out_ports = VectorRequestPort("Outgoing ports to all MPUs") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " @@ -48,5 +46,3 @@ class MPU(ClockedObject): push_engine = Param.PushEngine(NULL, "Internal PushEngine for each " "instance of MPU object.") - update_queue_size = Param.Int("Maximum number of entries " - "for each update queue.") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 7dba86aff2..5e0d2b3212 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,6 +34,8 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' + workload = Param.String("BFS", "Name of the workload.") + push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -45,4 +47,7 @@ class PushEngine(BaseMemoryEngine): max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") - workload = Param.String("BFS", "Name of the workload.") + update_queue_size = Param.Int("Maximum number of entries " + "for each update queue.") + + out_ports = VectorRequestPort("Outgoing ports to all MPUs") diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 4a80b22979..76d7d3114f 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -41,21 +41,12 @@ MPU::MPU(const Params& params): system(params.system), wlEngine(params.wl_engine), 
coalesceEngine(params.coalesce_engine), - pushEngine(params.push_engine), - updateQueueSize(params.update_queue_size), - nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()) + pushEngine(params.push_engine) { wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); - } - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { inPorts.emplace_back( name() + ".in_ports" + std::to_string(i), this, i); @@ -67,8 +58,6 @@ MPU::getPort(const std::string& if_name, PortID idx) { if (if_name == "in_ports") { return inPorts[idx]; - } else if (if_name == "out_ports") { - return outPorts[idx]; } else { return ClockedObject::getPort(if_name, idx); } @@ -77,13 +66,9 @@ MPU::getPort(const std::string& if_name, PortID idx) void MPU::init() { - localAddrRange = getAddrRanges(); for (int i = 0; i < inPorts.size(); i++){ inPorts[i].sendRangeChange(); } - for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); - } } void @@ -144,46 +129,6 @@ MPU::RespPort::recvRespRetry() panic("recvRespRetry from response port is called."); } -void -MPU::ReqPort::sendPacket(PacketPtr pkt) -{ - panic_if(blockedPacket != nullptr, - "Should never try to send if blocked!"); - // If we can't send the packet across the port, store it for later. 
- if (!sendTimingReq(pkt)) - { - blockedPacket = pkt; - } -} - -bool -MPU::ReqPort::recvTimingResp(PacketPtr pkt) -{ - panic("recvTimingResp called on the request port."); -} - -void -MPU::ReqPort::recvReqRetry() -{ - panic_if(blockedPacket == nullptr, - "Received retry without a blockedPacket."); - - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); - if (blockedPacket == nullptr) { - owner->recvReqRetry(); - } -} - -void -MPU::recvReqRetry() -{ - if (!nextUpdatePushEvent.scheduled()) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { @@ -202,85 +147,6 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } -bool -MPU::enqueueUpdate(Update update) -{ - Addr dst_addr = update.dst; - bool found_locally = false; - bool accepted = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(dst_addr); - } - for (int i = 0; i < outPorts.size(); i++) { - AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(MPU, "%s: Queue %d received an update.\n", __func__, i); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - accepted = true; - break; - } - } - } - } - - if (accepted && (!nextUpdatePushEvent.scheduled())) { - schedule(nextUpdatePushEvent, nextCycle()); - } - - return accepted; -} - -template PacketPtr -MPU::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) 1) << 2); - - // FIXME: MemCmd::UpdateWL - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - // pkt->setData(data); - pkt->setLE(value); - - return pkt; -} - -void -MPU::processNextUpdatePushEvent() -{ - int next_time_send = 0; - - 
for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { - continue; - } - if (outPorts[i].blocked()) { - continue; - } - Update update; - Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); - PacketPtr pkt = createUpdatePacket(update.dst, update.value); - outPorts[i].sendPacket(pkt); - DPRINTF(MPU, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); - if (updateQueues[i].size() > 0) { - next_time_send += 1; - } - } - - assert(!nextUpdatePushEvent.scheduled()); - if (next_time_send > 0) { - schedule(nextUpdatePushEvent, nextCycle()); - } -} - void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ff17eada0e..4215f82d5b 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,27 +75,6 @@ class MPU : public ClockedObject virtual void recvRespRetry(); }; - class ReqPort : public RequestPort - { - private: - MPU* owner; - PacketPtr blockedPacket; - PortID _id; - - public: - ReqPort(const std::string& name, MPU* owner, PortID id) : - RequestPort(name, owner), - owner(owner), blockedPacket(nullptr), _id(id) - {} - void sendPacket(PacketPtr pkt); - bool blocked() { return (blockedPacket != nullptr); } - PortID id() { return _id; } - - protected: - virtual bool recvTimingResp(PacketPtr pkt); - virtual void recvReqRetry(); - }; - System* system; CenteralController* centeralController; @@ -103,20 +82,7 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - AddrRangeList localAddrRange; - - uint32_t updateQueueSize; - - std::unordered_map portAddrMap; - std::vector inPorts; - std::vector outPorts; - std::vector>> updateQueues; - - template PacketPtr createUpdatePacket(Addr addr, T value); - - EventFunctionWrapper nextUpdatePushEvent; - void processNextUpdatePushEvent(); public: PARAMS(MPU); @@ 
-133,7 +99,6 @@ class MPU : public ClockedObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); - bool enqueueUpdate(Update update); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return coalesceEngine->recvVertexPull(); } @@ -141,7 +106,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void recvReqRetry(); void checkRetryReq(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d533f1ea79..70c10cc358 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -44,11 +44,40 @@ PushEngine::PushEngine(const Params& params): onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), workload(params.workload), + updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), nextPropagateEvent([this] { processNextPropagateEvent(); }, name()), + nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_out_ports_connection_count; ++i) { + outPorts.emplace_back( + name() + ".out_ports" + std::to_string(i), this, i); + updateQueues.emplace_back(); + } +} + +Port& +PushEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "out_ports") { + return outPorts[idx]; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort(if_name, idx); + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +PushEngine::init() +{ + localAddrRange = owner->getAddrRanges(); + for (int i = 0; i < outPorts.size(); i++){ + portAddrMap[outPorts[i].id()] = 
outPorts[i].getAddrRanges(); + } +} void PushEngine::registerMPU(MPU* mpu) @@ -56,6 +85,46 @@ PushEngine::registerMPU(MPU* mpu) owner = mpu; } +void +PushEngine::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) + { + blockedPacket = pkt; + } +} + +bool +PushEngine::ReqPort::recvTimingResp(PacketPtr pkt) +{ + panic("recvTimingResp called on the request port."); +} + +void +PushEngine::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + owner->recvReqRetry(); + } +} + +void +PushEngine::recvReqRetry() +{ + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + bool PushEngine::vertexSpace() { @@ -255,7 +324,7 @@ PushEngine::processNextPropagateEvent() Update update(meta_edge.src, meta_edge.dst, update_value); edge_list.pop_front(); - if (owner->enqueueUpdate(update)) { + if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; @@ -285,6 +354,87 @@ PushEngine::processNextPropagateEvent() } } +bool +PushEngine::enqueueUpdate(Update update) +{ + Addr dst_addr = update.dst; + bool found_locally = false; + bool accepted = false; + for (auto range : localAddrRange) { + found_locally |= range.contains(dst_addr); + } + for (int i = 0; i < outPorts.size(); i++) { + AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; + for (auto range : addr_range_list) { + if (range.contains(dst_addr)) { + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, 
"%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + accepted = true; + break; + } + } + } + } + + if (accepted && (!nextUpdatePushEvent.scheduled())) { + schedule(nextUpdatePushEvent, nextCycle()); + } + + return accepted; +} + +template PacketPtr +PushEngine::createUpdatePacket(Addr addr, T value) +{ + RequestPtr req = std::make_shared(addr, sizeof(T), 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 1) << 2); + + // FIXME: MemCmd::UpdateWL + PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); + + pkt->allocate(); + // pkt->setData(data); + pkt->setLE(value); + + return pkt; +} + +void +PushEngine::processNextUpdatePushEvent() +{ + int next_time_send = 0; + + for (int i = 0; i < updateQueues.size(); i++) { + if (updateQueues[i].empty()) { + continue; + } + if (outPorts[i].blocked()) { + continue; + } + Update update; + Tick entrance_tick; + std::tie(update, entrance_tick) = updateQueues[i].front(); + PacketPtr pkt = createUpdatePacket(update.dst, update.value); + outPorts[i].sendPacket(pkt); + DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + "%d.\n", __func__, update.src, update.dst, update.value); + updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + if (updateQueues[i].size() > 0) { + next_time_send += 1; + } + } + + assert(!nextUpdatePushEvent.scheduled()); + if (next_time_send > 0) { + schedule(nextUpdatePushEvent, nextCycle()); + } +} + PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fed6909733..99fec33f2c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -42,6 +42,27 @@ class MPU; class PushEngine : public BaseMemoryEngine { private: + class ReqPort : public 
RequestPort + { + private: + PushEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, PushEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + PortID id() { return _id; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + class EdgeReadInfoGen { private: Addr _start; @@ -95,6 +116,8 @@ class PushEngine : public BaseMemoryEngine bool _running; Tick lastIdleEntranceTick; + AddrRangeList localAddrRange; + int numPendingPulls; int edgePointerQueueSize; std::deque edgePointerQueue; @@ -108,6 +131,13 @@ class PushEngine : public BaseMemoryEngine std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); + int updateQueueSize; + std::vector>> updateQueues; + template PacketPtr createUpdatePacket(Addr addr, T value); + bool enqueueUpdate(Update update); + std::unordered_map portAddrMap; + std::vector outPorts; + bool vertexSpace(); bool workLeft(); @@ -120,6 +150,9 @@ class PushEngine : public BaseMemoryEngine EventFunctionWrapper nextPropagateEvent; void processNextPropagateEvent(); + EventFunctionWrapper nextUpdatePushEvent; + void processNextUpdatePushEvent(); + struct PushStats : public statistics::Group { PushStats(PushEngine &push); @@ -147,6 +180,9 @@ class PushEngine : public BaseMemoryEngine public: PARAMS(PushEngine); PushEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } From d2e6f2e7119437f6762f03cf93f85bdb0beb67b5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 3 Oct 2022 10:01:32 -0700 Subject: [PATCH 179/287] Moving respPorts from MPU to WLEngine --- configs/accl/sega.py | 4 +- 
src/accl/graph/sega/MPU.py | 7 +-- src/accl/graph/sega/WLEngine.py | 6 ++- src/accl/graph/sega/mpu.cc | 79 ++------------------------- src/accl/graph/sega/mpu.hh | 39 ++------------ src/accl/graph/sega/wl_engine.cc | 93 ++++++++++++++++++++++++++++++-- src/accl/graph/sega/wl_engine.hh | 34 ++++++++++++ 7 files changed, 140 insertions(+), 122 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 21a041180f..c6c2171315 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -83,9 +83,9 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_ports + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_ports = port + self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports diff --git a/src/accl/graph/sega/MPU.py b/src/accl/graph/sega/MPU.py index 3547cb8817..8d2453b01c 100644 --- a/src/accl/graph/sega/MPU.py +++ b/src/accl/graph/sega/MPU.py @@ -27,18 +27,15 @@ from m5.params import * from m5.proxy import * -from m5.objects.ClockedObject import ClockedObject +from m5.SimObject import SimObject -class MPU(ClockedObject): +class MPU(SimObject): type = "MPU" cxx_header = "accl/graph/sega/mpu.hh" cxx_class = "gem5::MPU" system = Param.System(Parent.any, "System this MPU is a part of") - in_ports = VectorResponsePort("Incoming Ports to receive updates from " - "remote outside") - wl_engine = Param.WLEngine(NULL, "Internal WLEngine for each instance of " "MPU object.") coalesce_engine = Param.CoalesceEngine(NULL, "Internal CoalesceEngine for " diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index a44352ab9b..91325ab53f 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -34,11 +34,15 @@ class WLEngine(BaseReduceEngine): cxx_header = "accl/graph/sega/wl_engine.hh" cxx_class = 'gem5::WLEngine' + in_ports = VectorResponsePort("Incoming Ports to receive updates from " + "remote 
outside") + update_queue_size = Param.Int("Size of the queue WLEngine stores " "the incoming updates") + register_file_size = Param.Int("Number of internal registers the " "WLEngine has. It can service as " "many updates as this queueu has " - "entries at the same time.") # 4 is arbitrary + "entries at the same time.") workload = Param.String('BFS',"Name of the workload") \ No newline at end of file diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 76d7d3114f..c8d0f636f2 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,7 +29,6 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" -#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -37,7 +36,7 @@ namespace gem5 { MPU::MPU(const Params& params): - ClockedObject(params), + SimObject(params), system(params.system), wlEngine(params.wl_engine), coalesceEngine(params.coalesce_engine), @@ -46,30 +45,10 @@ MPU::MPU(const Params& params): wlEngine->registerMPU(this); coalesceEngine->registerMPU(this); pushEngine->registerMPU(this); - - for (int i = 0; i < params.port_in_ports_connection_count; ++i) { - inPorts.emplace_back( - name() + ".in_ports" + std::to_string(i), this, i); - } -} - -Port& -MPU::getPort(const std::string& if_name, PortID idx) -{ - if (if_name == "in_ports") { - return inPorts[idx]; - } else { - return ClockedObject::getPort(if_name, idx); - } } -void -MPU::init() -{ - for (int i = 0; i < inPorts.size(); i++){ - inPorts[i].sendRangeChange(); - } -} +MPU::~MPU() +{} void MPU::registerCenteralController(CenteralController* centeral_controller) @@ -77,58 +56,6 @@ MPU::registerCenteralController(CenteralController* centeral_controller) centeralController = centeral_controller; } -AddrRangeList -MPU::RespPort::getAddrRanges() const -{ - return owner->getAddrRanges(); -} - -void -MPU::RespPort::checkRetryReq() -{ - if (needSendRetryReq) { - sendRetryReq(); - needSendRetryReq = false; - } -} 
- -void -MPU::checkRetryReq() -{ - for (int i = 0; i < inPorts.size(); ++i) { - inPorts[i].checkRetryReq(); - } -} - -bool -MPU::RespPort::recvTimingReq(PacketPtr pkt) -{ - if (!owner->handleIncomingUpdate(pkt)) { - needSendRetryReq = true; - return false; - } - - return true; -} - -Tick -MPU::RespPort::recvAtomic(PacketPtr pkt) -{ - panic("recvAtomic unimpl."); -} - -void -MPU::RespPort::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); -} - -void -MPU::RespPort::recvRespRetry() -{ - panic("recvRespRetry from response port is called."); -} - bool MPU::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 4215f82d5b..a1e5055226 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -38,8 +38,7 @@ #include "accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" #include "mem/packet.hh" -#include "mem/port.hh" -#include "sim/clocked_object.hh" +#include "sim/sim_object.hh" #include "sim/system.hh" #include "params/MPU.hh" @@ -48,33 +47,9 @@ namespace gem5 class CenteralController; -class MPU : public ClockedObject +class MPU : public SimObject { private: - class RespPort : public ResponsePort - { - private: - MPU* owner; - bool needSendRetryReq; - PortID _id; - - public: - RespPort(const std::string& name, MPU* owner, PortID id): - ResponsePort(name, owner), - owner(owner), needSendRetryReq(false), _id(id) - {} - virtual AddrRangeList getAddrRanges() const; - - PortID id() { return _id; } - void checkRetryReq(); - - protected: - virtual bool recvTimingReq(PacketPtr pkt); - virtual Tick recvAtomic(PacketPtr pkt); - virtual void recvFunctional(PacketPtr pkt); - virtual void recvRespRetry(); - }; - System* system; CenteralController* centeralController; @@ -82,20 +57,16 @@ class MPU : public ClockedObject CoalesceEngine* coalesceEngine; PushEngine* pushEngine; - std::vector inPorts; - public: PARAMS(MPU); MPU(const Params& params); - Port& getPort(const std::string& 
if_name, - PortID idx = InvalidPortID) override; - virtual void init() override; + ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } - bool handleIncomingUpdate(PacketPtr pkt); + void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); @@ -106,8 +77,6 @@ class MPU : public ClockedObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); - void checkRetryReq(); - void recvDoneSignal(); bool done(); }; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0267bd46b6..9a548a3255 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -45,7 +45,30 @@ WLEngine::WLEngine(const WLEngineParams& params): nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), stats(*this) -{} +{ + for (int i = 0; i < params.port_in_ports_connection_count; ++i) { + inPorts.emplace_back( + name() + ".in_ports" + std::to_string(i), this, i); + } +} + +Port& +WLEngine::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "in_ports") { + return inPorts[idx]; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + +void +WLEngine::init() +{ + for (int i = 0; i < inPorts.size(); i++){ + inPorts[i].sendRangeChange(); + } +} void WLEngine::registerMPU(MPU* mpu) @@ -53,6 +76,70 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } +AddrRangeList +WLEngine::getAddrRanges() +{ + return owner->getAddrRanges(); +} + +void +WLEngine::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +AddrRangeList +WLEngine::RespPort::getAddrRanges() const +{ + return owner->getAddrRanges(); +} + +void 
+WLEngine::RespPort::checkRetryReq() +{ + if (needSendRetryReq) { + sendRetryReq(); + needSendRetryReq = false; + } +} + +bool +WLEngine::RespPort::recvTimingReq(PacketPtr pkt) +{ + if (!owner->handleIncomingUpdate(pkt)) { + needSendRetryReq = true; + return false; + } + + return true; +} + +Tick +WLEngine::RespPort::recvAtomic(PacketPtr pkt) +{ + panic("recvAtomic unimpl."); +} + +void +WLEngine::RespPort::recvFunctional(PacketPtr pkt) +{ + owner->recvFunctional(pkt); +} + +void +WLEngine::RespPort::recvRespRetry() +{ + panic("recvRespRetry from response port is called."); +} + +void +WLEngine::checkRetryReq() +{ + for (int i = 0; i < inPorts.size(); ++i) { + inPorts[i].checkRetryReq(); + } +} + bool WLEngine::done() { @@ -144,7 +231,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); vertexReadTime[update_addr] = curTick(); } } else { @@ -173,7 +260,7 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - owner->checkRetryReq(); + checkRetryReq(); } if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f888979be9..5f08678d26 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -45,8 +45,34 @@ class MPU; class WLEngine : public BaseReduceEngine { private: + class RespPort : public ResponsePort + { + private: + WLEngine* owner; + bool needSendRetryReq; + PortID _id; + + public: + RespPort(const std::string& name, WLEngine* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + MPU* owner; + std::vector inPorts; + int updateQueueSize; std::deque> updateQueue; @@ -86,11 +112,19 @@ class WLEngine : public BaseReduceEngine public: PARAMS(WLEngine); WLEngine(const Params& params); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void init() override; void registerMPU(MPU* mpu); + AddrRangeList getAddrRanges(); + void recvFunctional(PacketPtr pkt); + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); + void checkRetryReq(); + bool done(); }; From 07cfd5fbb3381f8be86224e491c0eb0dc5d9da97 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 3 Oct 2022 12:58:25 -0700 Subject: [PATCH 180/287] Updating dprintfs. 
--- src/accl/graph/sega/push_engine.cc | 50 ++++++++++++++++++++++++------ 1 file changed, 40 insertions(+), 10 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70c10cc358..9039eb408d 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -109,10 +109,12 @@ PushEngine::ReqPort::recvReqRetry() panic_if(blockedPacket == nullptr, "Received retry without a blockedPacket."); + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); if (blockedPacket == nullptr) { + DPRINTF(PushEngine, "%s: blockedPacket sent successfully.\n", __func__); owner->recvReqRetry(); } } @@ -120,6 +122,7 @@ PushEngine::ReqPort::recvReqRetry() void PushEngine::recvReqRetry() { + DPRINTF(PushEngine, "%s: Received a reqRetry.\n", __func__); if (!nextUpdatePushEvent.scheduled()) { schedule(nextUpdatePushEvent, nextCycle()); } @@ -325,7 +328,7 @@ PushEngine::processNextPropagateEvent() edge_list.pop_front(); if (enqueueUpdate(update)) { - DPRINTF(PushEngine, "%s: Sending %s to port queues.\n", + DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); stats.numUpdates++; stats.edgeQueueLatency.sample( @@ -363,14 +366,17 @@ PushEngine::enqueueUpdate(Update update) for (auto range : localAddrRange) { found_locally |= range.contains(dst_addr); } + DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; for (auto range : addr_range_list) { if (range.contains(dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), 
outPorts[i].id()); if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: Queue %d received an update.\n", __func__, i); + DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); accepted = true; break; } @@ -408,23 +414,47 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - for (int i = 0; i < updateQueues.size(); i++) { - if (updateQueues[i].empty()) { + // for (int i = 0; i < updateQueues.size(); i++) { + // if (updateQueues[i].empty()) { + // continue; + // } + // if (outPorts[i].blocked()) { + // continue; + // } + // Update update; + // Tick entrance_tick; + // std::tie(update, entrance_tick) = updateQueues[i].front(); + // PacketPtr pkt = createUpdatePacket(update.dst, update.value); + // outPorts[i].sendPacket(pkt); + // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " + // "%d.\n", __func__, update.src, update.dst, update.value); + // updateQueues[i].pop_front(); + // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); + // if (updateQueues[i].size() > 0) { + // next_time_send += 1; + // } + // } + + for (int i = 0; i < outPorts.size(); i++) { + if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); continue; } - if (outPorts[i].blocked()) { + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + if (updateQueues[outPorts[i].id()].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, 
outPorts[i].id()); continue; } + DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[i].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - "%d.\n", __func__, update.src, update.dst, update.value); - updateQueues[i].pop_front(); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - if (updateQueues[i].size() > 0) { + if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } } From bab798ddaa2384e934ebc1775ac5755f83affdc8 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 4 Oct 2022 12:49:29 -0700 Subject: [PATCH 181/287] Fixing the problems with retry --- configs/accl/sega.py | 6 +++--- src/accl/graph/sega/push_engine.cc | 8 ++++---- src/accl/graph/sega/push_engine.hh | 3 ++- src/accl/graph/sega/wl_engine.cc | 2 +- 4 files changed, 10 insertions(+), 9 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c6c2171315..6b198c5f4a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,8 +48,8 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=2, + register_file_size=2 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=2 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 9039eb408d..238b8a89fb 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -54,7 +54,6 @@ PushEngine::PushEngine(const Params& params): for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( name() + ".out_ports" + std::to_string(i), this, i); - updateQueues.emplace_back(); } } @@ -93,6 +92,7 @@ PushEngine::ReqPort::sendPacket(PacketPtr pkt) // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { + DPRINTF(PushEngine, "%s: Packet is blocked.\n", __func__); blockedPacket = pkt; } } @@ -386,7 +386,7 @@ PushEngine::enqueueUpdate(Update update) if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -448,10 +448,10 @@ PushEngine::processNextUpdatePushEvent() DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[i].front(); + std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d.\n", __func__, outPorts[i].id(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 99fec33f2c..4e0cdbc526 100644 --- 
a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -132,10 +132,11 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - std::vector>> updateQueues; + // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; + std::unordered_map>> updateQueues; std::vector outPorts; bool vertexSpace(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 9a548a3255..116cdf3f77 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -98,8 +98,8 @@ void WLEngine::RespPort::checkRetryReq() { if (needSendRetryReq) { - sendRetryReq(); needSendRetryReq = false; + sendRetryReq(); } } From 6140135bdc790a77b13d8026292874c3d91154fd Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Oct 2022 14:10:57 -0700 Subject: [PATCH 182/287] Fixing done, code style and conifg. Adding a stat. 
--- configs/accl/sega-simple.py | 68 ++++++------- configs/accl/sega-single-simple.py | 151 ---------------------------- configs/accl/sega-single.py | 155 ----------------------------- configs/accl/sega.py | 14 +-- src/accl/graph/sega/mpu.cc | 3 - src/accl/graph/sega/mpu.hh | 1 - src/accl/graph/sega/push_engine.cc | 97 ++++++++++-------- src/accl/graph/sega/push_engine.hh | 4 +- 8 files changed, 90 insertions(+), 403 deletions(-) delete mode 100644 configs/accl/sega-single-simple.py delete mode 100644 configs/accl/sega-single.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index fffc273ee1..54a90281bf 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -48,20 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64 + resp_queue_size=64, + update_queue_size=16, ) self.vertex_mem_ctrl = SimpleMemory( @@ -88,14 +89,14 @@ def __init__(self, edge_memory_size: str, cache_size: str): ) def getRespPort(self): - return self.mpu.in_port + return self.wl_engine.in_ports def setRespPort(self, port): - self.mpu.in_port = port + self.wl_engine.in_ports = port def getReqPort(self): - return self.mpu.out_port + return self.push_engine.out_ports def setReqPort(self, port): - self.mpu.out_port = port + self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range @@ -103,54 +104,39 @@ def set_edge_image(self, edge_image): self.edge_mem_ctrl.image_file = edge_image class 
SEGA(System): - def __init__( - self, - num_mpus, - cache_size, - graph_path, - first_addr, - first_value - ): + def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '4GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.interconnect = NoncoherentXBar( - frontend_latency=1, - forward_latency=1, - response_latency=1, - width=64 - ) - - self.ctrl = CenteralController( - init_addr=first_addr, - init_value=first_value, - image_file=f"{graph_path}/vertices" - ) - - self.ctrl.req_port = self.interconnect.cpu_side_ports + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("8GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpt.setReqPort(self.interconnect.cpu_side_ports) - gpt.setRespPort(self.interconnect.mem_side_ports) gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -167,11 +153,13 @@ def get_inputs(): if __name__ == "__m5_main__": num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - system = SEGA(num_gpts, cache_size, graph, init_addr, init_value) + system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) m5.instantiate() + system.create_initial_bfs_update(init_addr, init_value) + exit_event 
= m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single-simple.py b/configs/accl/sega-single-simple.py deleted file mode 100644 index eacb16d3d1..0000000000 --- a/configs/accl/sega-single-simple.py +++ /dev/null @@ -1,151 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_ports - def setRespPort(self, port): - self.mpu.in_ports = port - - def setReqPort(self, port): - self.mpu.out_ports = port - def getReqPort(self): - return self.mpu.out_ports - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_vertex_image(self, vertex_image): - self.vertex_mem_ctrl.image_file = vertex_image - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, 
graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_range(AddrRange("4GiB")) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega-single.py b/configs/accl/sega-single.py deleted file mode 100644 index e4f7942f42..0000000000 --- a/configs/accl/sega-single.py +++ /dev/null @@ -1,155 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, - register_file_size=32 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64 - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GiB/s" - ) - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.mpu.in_port - def setRespPort(self, port): - self.mpu.in_port = port - - def getReqPort(self): - return self.mpu.out_ports - def setReqPort(self, port): - self.mpu.out_ports = port - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - 
self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - plain_vertex_range = AddrRange("4GiB") - self._vertex_ranges = interleave_addresses( - plain_vertex_range, - 1, - 32 - ) - - gpts = [GPT("8GiB", cache_size)] - gpts[0].set_vertex_ranges(self._vertex_ranges[0]) - gpts[0].set_edge_image(f"{graph_path}/edgelist_0") - gpts[0].setReqPort(gpts[0].getRespPort()) - self.gpts = gpts - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.cache_size, args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 6b198c5f4a..fab414f2c5 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,21 +48,21 @@ class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=2, - register_file_size=2 + update_queue_size=128, + register_file_size=64 ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=32, - num_tgts_per_mshr=32, - max_resp_per_cycle=4 + num_mshr_entry=64, 
+ num_tgts_per_mshr=64, + max_resp_per_cycle=8 ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=2 + update_queue_size=16 ) self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) @@ -101,7 +101,7 @@ class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '1GHz' + self.clk_domain.clock = '2GHz' self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index c8d0f636f2..44054d1efb 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -47,9 +47,6 @@ MPU::MPU(const Params& params): pushEngine->registerMPU(this); } -MPU::~MPU() -{} - void MPU::registerCenteralController(CenteralController* centeral_controller) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index a1e5055226..229bd28950 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -60,7 +60,6 @@ class MPU : public SimObject public: PARAMS(MPU); MPU(const Params& params); - ~MPU(); void registerCenteralController(CenteralController* centeral_controller); AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 238b8a89fb..5835b61fc6 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -53,7 +53,7 @@ PushEngine::PushEngine(const Params& params): { for (int i = 0; i < params.port_out_ports_connection_count; ++i) { outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); + name() + ".out_ports" + std::to_string(i), this, i); } } @@ -144,9 +144,12 @@ PushEngine::workLeft() bool PushEngine::done() { - return edgeQueue.empty() && - (onTheFlyMemReqs == 0) && - 
edgePointerQueue.empty(); + bool empty_update_queues = true; + for (int i = 0; i < outPorts.size(); i++) { + empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + } + return empty_update_queues && edgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -357,6 +360,16 @@ PushEngine::processNextPropagateEvent() } } +bool +contains(AddrRangeList range_list, Addr addr) +{ + bool found = false; + for (auto range: range_list) { + found |= range.contains(addr); + } + return found; +} + bool PushEngine::enqueueUpdate(Update update) { @@ -369,24 +382,32 @@ PushEngine::enqueueUpdate(Update update) DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); for (int i = 0; i < outPorts.size(); i++) { AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - for (auto range : addr_range_list) { - if (range.contains(dst_addr)) { - DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", __func__, update.to_string(), outPorts[i].id()); - DPRINTF(PushEngine, "%s: There are %d updates already in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { - DPRINTF(PushEngine, "%s: There is a free entry available in queue %d.\n", __func__, outPorts[i].id()); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Emplaced the update at the back of queue for port %d is. 
Size of queue for port %d is %d.\n", __func__, outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); - accepted = true; - break; - } + if (contains(addr_range_list, dst_addr)) { + DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", + __func__, update.to_string(), outPorts[i].id()); + DPRINTF(PushEngine, "%s: There are %d updates already " + "in queue for port %d.\n", __func__, + updateQueues[outPorts[i].id()].size(), + outPorts[i].id()); + if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue %d.\n", __func__, outPorts[i].id()); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + DPRINTF(PushEngine, "%s: Emplaced the update at the back " + "of queue for port %d is. Size of queue " + "for port %d is %d.\n", __func__, + outPorts[i].id(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); + accepted = true; + stats.updateQueueLength.sample( + updateQueues[outPorts[i].id()].size()); } } } if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); - } + } return accepted; } @@ -414,46 +435,31 @@ PushEngine::processNextUpdatePushEvent() { int next_time_send = 0; - // for (int i = 0; i < updateQueues.size(); i++) { - // if (updateQueues[i].empty()) { - // continue; - // } - // if (outPorts[i].blocked()) { - // continue; - // } - // Update update; - // Tick entrance_tick; - // std::tie(update, entrance_tick) = updateQueues[i].front(); - // PacketPtr pkt = createUpdatePacket(update.dst, update.value); - // outPorts[i].sendPacket(pkt); - // DPRINTF(PushEngine, "%s: Sent update from addr: %lu to addr: %lu with value: " - // "%d.\n", __func__, update.src, update.dst, update.value); - // updateQueues[i].pop_front(); - // DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); - // if (updateQueues[i].size() > 0) { - // next_time_send += 1; - // } - // } - 
for (int i = 0; i < outPorts.size(); i++) { if (outPorts[i].blocked()) { - DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d blocked.\n", + __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d available.\n", + __func__, outPorts[i].id()); if (updateQueues[outPorts[i].id()].empty()) { - DPRINTF(PushEngine, "%s: Respective queue for port %d is empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d is empty.\n", __func__, outPorts[i].id()); continue; } - DPRINTF(PushEngine, "%s: Respective queue for port %d not empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Respective queue for port " + "%d not empty.\n", __func__, outPorts[i].id()); Update update; Tick entrance_tick; std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); PacketPtr pkt = createUpdatePacket(update.dst, update.value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s from queue %d to port %d the queue size is %d.\n", __func__, update.to_string(), outPorts[i].id(), outPorts[i].id(), updateQueues[outPorts[i].id()].size()); + DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" + "Respective queue size is %d.\n", __func__, + update.to_string(), outPorts[i].id(), + updateQueues[outPorts[i].id()].size()); updateQueues[outPorts[i].id()].pop_front(); - DPRINTF(PushEngine, "%s: Size of queue %d is %d.\n", __func__, i, updateQueues[outPorts[i].id()].size()); if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } @@ -480,7 +486,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue.") + "Histogram of the latency of the edgeQueue."), + ADD_STAT(updateQueueLength, statistics::units::Count::get(), + "Histogram of the length of updateQueues.") { } @@ -493,6 +501,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); + updateQueueLength.init(64); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 4e0cdbc526..fbe527bcb6 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -51,7 +51,7 @@ class PushEngine : public BaseMemoryEngine public: ReqPort(const std::string& name, PushEngine* owner, PortID id) : - RequestPort(name, owner), + RequestPort(name, owner), owner(owner), blockedPacket(nullptr), _id(id) {} void sendPacket(PacketPtr pkt); @@ -132,7 +132,6 @@ class PushEngine : public BaseMemoryEngine uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; - // std::vector>> updateQueues; template PacketPtr createUpdatePacket(Addr addr, T value); bool enqueueUpdate(Update update); std::unordered_map portAddrMap; @@ -170,6 +169,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; + statistics::Histogram updateQueueLength; }; PushStats stats; From 
4b555f682145b9f7dbd306ac5ff7ce47a150dc03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Oct 2022 15:35:54 -0700 Subject: [PATCH 183/287] Back indent. --- configs/accl/sega-simple.py | 24 ++++++++++++------------ 1 file changed, 12 insertions(+), 12 deletions(-) diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 54a90281bf..93267f0f24 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -31,18 +31,18 @@ from m5.objects import * def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): From fe68447f9d5b106c6802e2cd7e5e47718c0dd83c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 7 Oct 2022 10:27:22 -0700 Subject: [PATCH 184/287] Fixed HBM range issue. 
--- configs/accl/sega-hbm.py | 163 ++++++++++++++++++++++ src/accl/graph/sega/base_memory_engine.cc | 11 +- src/accl/graph/sega/coalesce_engine.cc | 27 ---- src/base/addr_range.hh | 44 +++--- src/mem/HBMCtrl.py | 2 + src/mem/hbm_ctrl.cc | 10 +- src/mem/hbm_ctrl.hh | 3 +- 7 files changed, 202 insertions(+), 58 deletions(-) create mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py new file mode 100644 index 0000000000..da7d79d7fe --- /dev/null +++ b/configs/accl/sega-hbm.py @@ -0,0 +1,163 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=128, + register_file_size=64 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64, + update_queue_size=16 + ) + + self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64()) + + self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), + in_addr_map=False + ) + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return 
self.wl_engine.in_ports + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + def setReqPort(self, port): + self.push_engine.out_ports = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.dram.image_file = edge_image + +class SEGA(System): + def __init__(self, num_mpus, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) + gpt.set_vertex_pch_bit(8) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if 
__name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/base_memory_engine.cc b/src/accl/graph/sega/base_memory_engine.cc index d9864664b1..9f704f71e9 100644 --- a/src/accl/graph/sega/base_memory_engine.cc +++ b/src/accl/graph/sega/base_memory_engine.cc @@ -60,13 +60,10 @@ BaseMemoryEngine::init() { AddrRangeList memory_ranges = memPort.getAddrRanges(); - if (memory_ranges.size() == 2) { - peerMemoryRange = merge(memory_ranges.front(), memory_ranges.back()); - } else if (memory_ranges.size() == 1) { - peerMemoryRange = memory_ranges.front(); - } else { - panic("Received an unacceptable number of ranges from memory."); - } + assert(memory_ranges.size() == 1); + + peerMemoryRange = memory_ranges.front(); + DPRINTF(BaseMemoryEngine, "%s: The range attached to this engine is " "%s. 
The range is %s interleaved.\n", __func__, peerMemoryRange.to_string(), diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0a4a041176..f4cd6a950d 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -127,15 +127,6 @@ int CoalesceEngine::getBlockIndex(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } @@ -145,15 +136,6 @@ int CoalesceEngine::getBitIndexBase(Addr addr) { assert((addr % peerMemoryAtomSize) == 0); - // bool found = false; - // Addr trimmed_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(addr)) { - // trimmed_addr = range.removeIntlvBits(addr); - // found = true; - // } - // } - // assert(found); Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); @@ -165,16 +147,7 @@ Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) { assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - // bool found = false; Addr trimmed_addr = index * sizeof(WorkListItem); - // Addr upgraded_addr; - // for (auto range: peerMemoryRanges) { - // if (range.contains(trimmed_addr)) { - // upgraded_addr = range.addIntlvBits(trimmed_addr); - // found = true; - // } - // } - // assert(found); return peerMemoryRange.addIntlvBits(trimmed_addr); } diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index a4bf581224..339fdb6c55 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -735,33 +735,37 @@ class AddrRange } friend AddrRange - 
merge(const AddrRange& left, const AddrRange& right) + mergePseudoChannelRanges(AddrRange left, AddrRange right, int pch_bit) { assert(left.interleaved()); assert(right.interleaved()); assert(left.mergesWith(right)); - int bits_org = left.masks.size(); - int bits_new = bits_org - 1; - - int left_match = left.intlvMatch; - int right_match = right.intlvMatch; - assert(std::abs(left_match - right_match) == (1 << bits_new)); - - Addr last_mask = left.masks[left.masks.size() - 1]; - int xor_high_bit_org = 0; - int xor_high_bit_new = 0; - if (!isPowerOf2(last_mask)) { - xor_high_bit_org = ceilLog2(last_mask); - xor_high_bit_new = xor_high_bit_org - 2; + uint8_t old_left_match = left.intlvMatch; + uint8_t new_left_match = 0; + uint8_t old_right_match = right.intlvMatch; + uint8_t new_right_match = 0; + int new_bits = left.masks.size() - 1; + + // assumption: masks is sorted in ascending order + std::vector new_masks; + for (auto mask: left.masks) { + uint64_t lsb_mask = (mask ^ (mask - 1)) + 1; + if ((lsb_mask >> 1) != (1 << pch_bit)) { + new_masks.push_back(mask); + new_left_match |= ((old_left_match & 1) << new_bits); + new_left_match >>= 1; + new_right_match |= ((old_right_match & 1) << new_bits); + new_right_match >>= 1; + } + old_left_match >>= 1; + old_right_match >>= 1; } - int intlv_high_bit_org = - ceilLog2(last_mask ^ (1 << xor_high_bit_org)); - int intlv_high_bit_new = intlv_high_bit_org - 2; + panic_if(new_left_match != new_right_match, + "The two ranges can not be a pseudo channel pair " + "given the pseudochannel bit position of params.pch_bit."); - int match = std::min(left_match, right_match); - return AddrRange(left._start, left._end, intlv_high_bit_new, - xor_high_bit_new, bits_new, match); + return AddrRange(left._start, left._end, new_masks, new_left_match); } }; diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index 0c7c1ea919..f7355d4b67 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -42,6 +42,8 @@ class HBMCtrl(MemCtrl): # HBMCtrl 
has been tested with two HBM_2000_4H_1x64 interfaces dram_2 = Param.DRAMInterface("DRAM memory interface") + pch_bit = Param.Int("Position of PseudoChannel bit in addresses.") + # For mixed traffic, HBMCtrl with HBM_2000_4H_1x64 interfaaces # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index 99618c4b5f..efd46bbd54 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -45,6 +45,7 @@ namespace memory HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : MemCtrl(p), + pchBit(p.pch_bit), retryRdReqPC1(false), retryWrReqPC1(false), nextReqEventPC1([this] {processNextReqEvent(pc1Int, respQueuePC1, respondEventPC1, nextReqEventPC1, retryWrReqPC1);}, @@ -233,7 +234,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) bool is_pc0; // TODO: make the interleaving bit across pseudo channels a parameter - if (bits(pkt->getAddr(), 6) == 0) { + if (bits(pkt->getAddr(), pchBit) == 0) { is_pc0 = true; } else { is_pc0 = false; @@ -492,8 +493,11 @@ AddrRangeList HBMCtrl::getAddrRanges() { AddrRangeList ranges; - ranges.push_back(pc0Int->getAddrRange()); - ranges.push_back(pc1Int->getAddrRange()); + AddrRange pc0Int_range = pc0Int->getAddrRange(); + AddrRange pc1Int_range = pc1Int->getAddrRange(); + ranges.push_back( + mergePseudoChannelRanges(pc0Int_range, pc1Int_range, pchBit) + ); return ranges; } diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index c9045f0ae7..f204b8346f 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -72,7 +72,8 @@ class HBMCtrl : public MemCtrl } private: - + // Position of the pseudochannel bit in addresses. + int pchBit; /** * Remember if we have to retry a request for second pseudo channel. 
*/ From d30ddb5df9c64082e10ff101b4064e41bbf41029 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 11:49:25 -0700 Subject: [PATCH 185/287] Refactoring reading edges from memory --- src/accl/graph/sega/push_engine.cc | 41 +++++++++++++----------------- src/accl/graph/sega/push_engine.hh | 10 ++++++-- 2 files changed, 25 insertions(+), 26 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 5835b61fc6..7265cec1a4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -148,7 +148,7 @@ PushEngine::done() for (int i = 0; i < outPorts.size(); i++) { empty_update_queues &= updateQueues[outPorts[i].id()].empty(); } - return empty_update_queues && edgeQueue.empty() && + return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } @@ -230,13 +230,13 @@ PushEngine::processNextMemoryReadEvent() nextMemoryReadEvent.sleep(); return; } + Addr aligned_addr, offset; + int num_edges; - if (edgeQueue.size() < (edgeQueueSize - onTheFlyMemReqs)) { - Addr aligned_addr, offset; - int num_edges; - - EdgeReadInfoGen &curr_info = edgePointerQueue.front(); - std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. 
aligned_addr: %lu, offset: %lu, " "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); @@ -246,8 +246,9 @@ PushEngine::processNextMemoryReadEvent() reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); - onTheFlyMemReqs++; + onTheFlyMemReqs += num_edges; + curr_info.iterate(); if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( @@ -290,19 +291,16 @@ PushEngine::handleMemResp(PacketPtr pkt) PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); - std::deque> edges; for (int i = 0; i < push_info.numElements; i++) { Edge* edge = (Edge*) (pkt_data + push_info.offset + i * sizeof(Edge)); Addr edge_dst = edge->neighbor; uint32_t edge_weight = edge->weight; MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); - edges.emplace_back(meta_edge, curTick()); + metaEdgeQueue.emplace_back(meta_edge, curTick()); } - assert(!edges.empty()); - edgeQueue.push_back(edges); - onTheFlyMemReqs--; + onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); delete pkt_data; delete pkt; @@ -318,17 +316,16 @@ PushEngine::processNextPropagateEvent() { int num_propagates = 0; while(true) { - std::deque>& edge_list = edgeQueue.front(); MetaEdge meta_edge; Tick entrance_tick; - std::tie(meta_edge, entrance_tick) = edge_list.front(); + std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); - edge_list.pop_front(); + metaEdgeQueue.pop_front(); if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", @@ -337,14 +334,10 @@ PushEngine::processNextPropagateEvent() stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { - 
edge_list.emplace_back(meta_edge, entrance_tick); - } - - if (edge_list.empty()) { - edgeQueue.pop_front(); + metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } - if (edgeQueue.empty()) { + if (metaEdgeQueue.empty()) { break; } @@ -355,7 +348,7 @@ PushEngine::processNextPropagateEvent() } assert(!nextPropagateEvent.scheduled()); - if (!edgeQueue.empty()) { + if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); } } @@ -486,7 +479,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), - "Histogram of the latency of the edgeQueue."), + "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues.") { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index fbe527bcb6..cc087aff11 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -93,11 +93,17 @@ class PushEngine : public BaseMemoryEngine } else { num_items = (_end - _start) / _step; } - _start = aligned_addr + _atom; return std::make_tuple(aligned_addr, offset, num_items); } + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + bool done() { return (_start >= _end); } Addr src() { return _src; } @@ -126,7 +132,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; int maxPropagatesPerCycle; - std::deque>> edgeQueue; + std::deque> metaEdgeQueue; std::string workload; uint32_t propagate(uint32_t value, uint32_t weight); From 7a6ab86032f9480e0c8d733a3968aa34f8d0eea2 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 7 Oct 2022 13:33:25 -0700 Subject: [PATCH 186/287] Added statistics to calculate 
number of propagates sent --- src/accl/graph/sega/push_engine.cc | 10 +++++++--- src/accl/graph/sega/push_engine.hh | 1 + 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 7265cec1a4..4b3277d3e1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -336,17 +336,18 @@ PushEngine::processNextPropagateEvent() } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } + num_propagates++; if (metaEdgeQueue.empty()) { break; } - - num_propagates++; if (num_propagates >= maxPropagatesPerCycle) { break; } } + stats.numPropagates.sample(num_propagates); + assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { schedule(nextPropagateEvent, nextCycle()); @@ -481,7 +482,9 @@ PushEngine::PushStats::PushStats(PushEngine &_push) ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), - "Histogram of the length of updateQueues.") + "Histogram of the length of updateQueues."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Histogram of number of propagates sent.") { } @@ -495,6 +498,7 @@ PushEngine::PushStats::regStats() edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); + numPropagates.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index cc087aff11..c078391420 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -176,6 +176,7 @@ class PushEngine : public BaseMemoryEngine statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; + statistics::Histogram numPropagates; }; PushStats stats; From 0bd83b6cc1c661fa484ab5d0a527d0a3d1e93722 Mon Sep 17 00:00:00 2001 
From: Marjan Fariborz Date: Sat, 8 Oct 2022 16:25:41 -0700 Subject: [PATCH 187/287] Adding coalescing to pushEngine --- src/accl/graph/sega/push_engine.cc | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4b3277d3e1..79e5344395 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -368,6 +368,7 @@ bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; + bool fount_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -383,7 +384,26 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - if (updateQueues[outPorts[i].id()].size() < updateQueueSize) { + for (auto itr = updateQueues[outPorts[i].id()].begin(); + itr != updateQueues[outPorts[i].id()].end(); + itr++){ + std::tuple curr_update = *itr; + if (std::get<0>(curr_update).dst == update.dst){ + uint32_t value = + std::min(std::get<0>(curr_update).value, update.value); + DPRINTF(PushEngine, "%s: found a coalescing opportunity " + "for destination %d new value: %d by comparing %d " + "and %d. 
\n", __func__, update.dst, value, + std::get<0>(curr_update).value, update.value); + fount_coalescing = true; + update.value = value; + updateQueues[outPorts[i].id()].erase(itr); + updateQueues[outPorts[i].id()].emplace_back(update, curTick()); + break; + } + } + if ((fount_coalescing == false) && + (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); updateQueues[outPorts[i].id()].emplace_back(update, curTick()); @@ -398,6 +418,7 @@ PushEngine::enqueueUpdate(Update update) } } } + fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); From 9f052dcf27a64d21582f48f41eb032bb1fe48464 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 8 Oct 2022 19:49:58 -0700 Subject: [PATCH 188/287] Adding function to print final answer. --- configs/accl/sega-hbm.py | 18 +++-- configs/accl/sega-simple.py | 2 +- configs/accl/sega.py | 2 +- src/accl/graph/sega/CenteralController.py | 5 +- src/accl/graph/sega/centeral_controller.cc | 44 +++++++++++- src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 80 ++++++++++++---------- src/accl/graph/sega/push_engine.hh | 9 ++- src/base/addr_range.hh | 10 +++ 9 files changed, 125 insertions(+), 48 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index da7d79d7fe..70aac6c2cb 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -61,8 +61,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=16 + resp_queue_size=512, + update_queue_size=32 ) self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), @@ -136,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): 
self.ctrl.createInitialBFSUpdate(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) @@ -143,14 +146,19 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument("--verify", type=bool, help="Print final answer") args = argparser.parse_args() + verify = False + if not args.verify is None: + verify = args.verify + return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value + args.graph, args.init_addr, args.init_value, verify if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system = False, system = system) @@ -161,3 +169,5 @@ def get_inputs(): exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py index 93267f0f24..7ec19c92ae 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega-simple.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16, + update_queue_size=32, ) self.vertex_mem_ctrl = SimpleMemory( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index fab414f2c5..c50c525297 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -62,7 +62,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=64, - update_queue_size=16 + update_queue_size=32 ) self.vertex_mem_ctrl = 
MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0721ff977c..2ba53c231f 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -41,4 +41,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") - cxx_exports = [PyBindMethod("createInitialBFSUpdate")] + cxx_exports = [ + PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("printAnswerToHostSimout") + ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 68b88e9e77..7c89c1edea 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,9 @@ #include "accl/graph/sega/centeral_controller.hh" +#include + +#include "base/cprintf.hh" #include "base/loader/memory_image.hh" #include "base/loader/object_file.hh" #include "debug/CenteralController.hh" @@ -62,7 +65,7 @@ CenteralController::initState() loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage image = object->buildImage(); - Addr maxVertexAddr = image.maxAddr(); + maxVertexAddr = image.maxAddr(); PortProxy proxy( [this](PacketPtr pkt) { @@ -97,6 +100,21 @@ CenteralController::startup() } } +PacketPtr +CenteralController::createReadPacket(Addr addr, unsigned int size) +{ + RequestPtr req = std::make_shared(addr, size, 0, 0); + // Dummy PC to have PC-based prefetchers latch on; get entropy into higher + // bits + req->setPC(((Addr) 0) << 2); + + // Embed it in a packet + PacketPtr pkt = new Packet(req, MemCmd::ReadReq); + pkt->allocate(); + + return pkt; +} + template PacketPtr CenteralController::createUpdatePacket(Addr addr, T value) { @@ -134,4 +152,28 @@ CenteralController::recvDoneSignal() } } +void +CenteralController::printAnswerToHostSimout() +{ + int num_items = system->cacheLineSize() / sizeof(WorkListItem); + 
WorkListItem items[num_items]; + for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + { + PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, addr)) { + mpu->recvFunctional(pkt); + } + } + pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + for (int i = 0; i < num_items; i++) { + std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + addr, i, items[i].to_string()); + + std::cout << print << std::endl; + } + } +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4a4e9c7cb1..d006851e3b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -53,6 +53,7 @@ class CenteralController : public ClockedObject std::vector mpuVector; std::unordered_map addrRangeListMap; + PacketPtr createReadPacket(Addr addr, unsigned int size); template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -64,6 +65,8 @@ class CenteralController : public ClockedObject void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void recvDoneSignal(); + + void printAnswerToHostSimout(); }; } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 79e5344395..d5fb002f82 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,7 +43,6 @@ PushEngine::PushEngine(const Params& params): numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), maxPropagatesPerCycle(params.max_propagates_per_cycle), - workload(params.workload), updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), nextMemoryReadEvent([this] { processNextMemoryReadEvent(); }, name()), @@ -152,10 +151,23 @@ PushEngine::done() (onTheFlyMemReqs == 
0) && edgePointerQueue.empty(); } +uint32_t +PushEngine::reduce(uint32_t update, uint32_t value) +{ + std::string workload = params().workload; + uint32_t new_value; + if(workload == "BFS"){ + new_value = std::min(update, value); + } else{ + panic("Workload not implemented\n"); + } + return new_value; +} uint32_t PushEngine::propagate(uint32_t value, uint32_t weight) { + std::string workload = params().workload; uint32_t update; if (workload == "BFS") { update = value + 1; @@ -235,7 +247,7 @@ PushEngine::processNextMemoryReadEvent() EdgeReadInfoGen& curr_info = edgePointerQueue.front(); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); - if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) + if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { DPRINTF(PushEngine, "%s: Current packet information generated by " "EdgeReadInfoGen. aligned_addr: %lu, offset: %lu, " @@ -299,6 +311,8 @@ PushEngine::handleMemResp(PacketPtr pkt) push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); } + stats.numWastefulEdgesRead += + (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); @@ -330,7 +344,7 @@ PushEngine::processNextPropagateEvent() if (enqueueUpdate(update)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); - stats.numUpdates++; + stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); } else { @@ -346,7 +360,7 @@ PushEngine::processNextPropagateEvent() } } - stats.numPropagates.sample(num_propagates); + stats.numPropagatesHist.sample(num_propagates); assert(!nextPropagateEvent.scheduled()); if (!metaEdgeQueue.empty()) { @@ -354,21 +368,11 @@ PushEngine::processNextPropagateEvent() } } -bool -contains(AddrRangeList range_list, Addr addr) -{ - bool found = false; - for (auto range: 
range_list) { - found |= range.contains(addr); - } - return found; -} - bool PushEngine::enqueueUpdate(Update update) { Addr dst_addr = update.dst; - bool fount_coalescing = false; + bool found_coalescing = false; bool found_locally = false; bool accepted = false; for (auto range : localAddrRange) { @@ -384,25 +388,21 @@ PushEngine::enqueueUpdate(Update update) "in queue for port %d.\n", __func__, updateQueues[outPorts[i].id()].size(), outPorts[i].id()); - for (auto itr = updateQueues[outPorts[i].id()].begin(); - itr != updateQueues[outPorts[i].id()].end(); - itr++){ - std::tuple curr_update = *itr; - if (std::get<0>(curr_update).dst == update.dst){ - uint32_t value = - std::min(std::get<0>(curr_update).value, update.value); + for (auto& entry: updateQueues[outPorts[i].id()]) { + Update& curr_update = std::get<0>(entry); + if (curr_update.dst == update.dst) { + uint32_t old_value = curr_update.value; + curr_update.value = reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " - "for destination %d new value: %d by comparing %d " - "and %d. \n", __func__, update.dst, value, - std::get<0>(curr_update).value, update.value); - fount_coalescing = true; - update.value = value; - updateQueues[outPorts[i].id()].erase(itr); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - break; + "for destination %d with new value: %d by " + "coalescing %d and %d. 
\n", __func__, update.dst, + curr_update.value, old_value, update.value); + found_coalescing = true; + accepted = true; + stats.updateQueueCoalescions++; } } - if ((fount_coalescing == false) && + if ((found_coalescing == false) && (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue %d.\n", __func__, outPorts[i].id()); @@ -418,7 +418,6 @@ PushEngine::enqueueUpdate(Update update) } } } - fount_coalescing = false; if (accepted && (!nextUpdatePushEvent.scheduled())) { schedule(nextUpdatePushEvent, nextCycle()); @@ -478,6 +477,7 @@ PushEngine::processNextUpdatePushEvent() if (updateQueues[outPorts[i].id()].size() > 0) { next_time_send += 1; } + stats.numUpdates++; } assert(!nextUpdatePushEvent.scheduled()); @@ -489,12 +489,18 @@ PushEngine::processNextUpdatePushEvent() PushEngine::PushStats::PushStats(PushEngine &_push) : statistics::Group(&_push), push(_push), - ADD_STAT(numUpdates, statistics::units::Count::get(), - "Number of sent updates."), + ADD_STAT(numPropagates, statistics::units::Count::get(), + "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), ADD_STAT(numIdleCycles, statistics::units::Count::get(), "Number of cycles PushEngine has been idle."), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), + ADD_STAT(numUpdates, statistics::units::Count::get(), + "Number of updates sent to the network."), + ADD_STAT(numWastefulEdgesRead, statistics::units::Count::get(), + "Number of wasteful edges read from edge memory."), ADD_STAT(TEPS, statistics::units::Rate::get(), "Traversed Edges Per Second."), @@ -504,7 +510,7 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Histogram of the latency of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), - 
ADD_STAT(numPropagates, statistics::units::Count::get(), + ADD_STAT(numPropagatesHist, statistics::units::Count::get(), "Histogram of number of propagates sent.") { } @@ -514,12 +520,12 @@ PushEngine::PushStats::regStats() { using namespace statistics; - TEPS = numUpdates / simSeconds; + TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); edgeQueueLatency.init(64); updateQueueLength.init(64); - numPropagates.init(push.params().max_propagates_per_cycle); + numPropagatesHist.init(push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c078391420..6163ba5c27 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -134,7 +134,7 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - std::string workload; + uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); int updateQueueSize; @@ -167,16 +167,19 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; - statistics::Scalar numUpdates; + statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; statistics::Scalar numIdleCycles; + statistics::Scalar updateQueueCoalescions; + statistics::Scalar numUpdates; + statistics::Scalar numWastefulEdgesRead; statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; statistics::Histogram edgeQueueLatency; statistics::Histogram updateQueueLength; - statistics::Histogram numPropagates; + statistics::Histogram numPropagatesHist; }; PushStats stats; diff --git a/src/base/addr_range.hh b/src/base/addr_range.hh index 339fdb6c55..3c5c150b29 100644 --- a/src/base/addr_range.hh +++ b/src/base/addr_range.hh @@ -852,6 +852,16 @@ RangeSize(Addr start, Addr size) return AddrRange(start, start + size); } +inline bool +contains(AddrRangeList range_list, Addr addr) +{ + bool ret = false; + for (auto range: range_list) { + 
ret |= range.contains(addr); + } + return ret; +} + } // namespace gem5 #endif // __BASE_ADDR_RANGE_HH__ From cc19d17fc22d22377f2d3d56c43fe981fb66f70f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 9 Oct 2022 17:15:04 -0700 Subject: [PATCH 189/287] Typos. --- configs/accl/real-graph-gen.py | 74 +++++++++++++++++++ configs/accl/sega-hbm.py | 14 ++-- .../accl/{graph-gen.py => synth-graph-gen.py} | 0 src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/wl_engine.cc | 12 +-- src/accl/graph/sega/wl_engine.hh | 2 +- 6 files changed, 89 insertions(+), 15 deletions(-) create mode 100644 configs/accl/real-graph-gen.py rename configs/accl/{graph-gen.py => synth-graph-gen.py} (100%) diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py new file mode 100644 index 0000000000..db44c63a9a --- /dev/null +++ b/configs/accl/real-graph-gen.py @@ -0,0 +1,74 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import os +import argparse +import subprocess + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("path", type=str, help="Path to the graph file.") + argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + + args = argparser.parse_args() + return args.path, args.num_gpts + +if __name__ == "__main__": + graph_path, num_gpts = get_inputs() + + graph_reader = os.environ.get("GRAPH_READER") + + if graph_reader is None: + raise ValueError(f"No value for $GRAPH_READER.") + + if not os.path.exists(graph_path): + raise ValueError(f"{graph_path} does not exist.") + + graph_dir = os.path.dirname(graph_path) + if not "binaries" in os.listdir(graph_dir): + print(f"binaries directory not found in {graph_dir}") + os.mkdir(f"{graph_dir}/binaries") + print(f"Created {graph_dir}/binaries") + + if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): + print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") + os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") + + expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] + if not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): + print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): + 
os.remove(delete.path) + print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run([f"{graph_reader}" , + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}"]) + print(f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}") diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 70aac6c2cb..cdc752f2bd 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -42,7 +42,7 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): xorHighBit=0, intlvBits=intlv_bits, intlvMatch=i)) - return ret + return ret, intlv_low_bit + intlv_bits - 1 class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): @@ -112,17 +112,17 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), + 2*num_mpus, + 32 + ) gpts = [] for i in range(num_mpus): gpt = GPT("2GiB", cache_size) gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(8) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus diff --git a/configs/accl/graph-gen.py b/configs/accl/synth-graph-gen.py similarity index 100% rename from configs/accl/graph-gen.py rename to configs/accl/synth-graph-gen.py diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7c89c1edea..82e63d512e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -168,7 +168,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - 
std::string print = csprintf("WorklistItem[%lu][%d]: %s.", + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, items[i].to_string()); std::cout << print << std::endl; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 116cdf3f77..eb2006a3df 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -76,16 +76,16 @@ WLEngine::registerMPU(MPU* mpu) owner = mpu; } -AddrRangeList +AddrRangeList WLEngine::getAddrRanges() -{ - return owner->getAddrRanges(); +{ + return owner->getAddrRanges(); } -void +void WLEngine::recvFunctional(PacketPtr pkt) -{ - owner->recvFunctional(pkt); +{ + owner->recvFunctional(pkt); } AddrRangeList diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 5f08678d26..7578044cbf 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -54,7 +54,7 @@ class WLEngine : public BaseReduceEngine public: RespPort(const std::string& name, WLEngine* owner, PortID id): - ResponsePort(name, owner), + ResponsePort(name, owner), owner(owner), needSendRetryReq(false), _id(id) {} virtual AddrRangeList getAddrRanges() const; From 76407f72953961561a153510f3dc81723f4847e1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 11 Oct 2022 15:07:29 -0700 Subject: [PATCH 190/287] Adding functions to move value to and from float. 
--- src/accl/graph/base/data_structs.hh | 24 +++++++++++++++++++++++- src/accl/graph/sega/push_engine.cc | 13 ++++++------- src/accl/graph/sega/push_engine.hh | 11 ++++------- 3 files changed, 33 insertions(+), 15 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 34c8eb98ce..3753e10d62 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -33,6 +33,8 @@ #include "base/intmath.hh" #include +#include +#include namespace gem5 { @@ -96,7 +98,7 @@ struct MetaEdge { uint32_t weight; uint32_t value; - MetaEdge(): src(0), dst(0), weight(0), value(0) + MetaEdge(): src(0), dst(0), weight(0), value(0) {} MetaEdge(uint64_t src, uint64_t dst, uint32_t weight, uint32_t value): src(src), dst(dst), weight(weight), value(value) @@ -176,6 +178,26 @@ class UniqueFIFO } }; +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d5fb002f82..cd795eaf00 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -220,10 +220,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - edgePointerQueue.emplace_back( - start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, - (uint32_t) wl.prop, curTick()); + EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), + peerMemoryAtomSize, addr, (uint32_t) wl.prop); + edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -245,7 +244,8 @@ PushEngine::processNextMemoryReadEvent() Addr aligned_addr, offset; int num_edges; - EdgeReadInfoGen& curr_info = edgePointerQueue.front(); + EdgeReadInfoGen& curr_info = std::get<0>(edgePointerQueue.front()); + Tick entrance_tick = std::get<1>(edgePointerQueue.front()); std::tie(aligned_addr, offset, num_edges) = curr_info.nextReadPacketInfo(); if (metaEdgeQueue.size() < (edgeQueueSize - (onTheFlyMemReqs + num_edges))) { @@ -264,8 +264,7 @@ PushEngine::processNextMemoryReadEvent() if (curr_info.done()) { DPRINTF(PushEngine, "%s: Current EdgeReadInfoGen is done.\n", __func__); stats.edgePointerQueueLatency.sample( - (curTick() - curr_info.entrance()) * - 1e9 / getClockFrequency()); + (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. " "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 6163ba5c27..acf012b24d 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -73,12 +73,11 @@ class PushEngine : public BaseMemoryEngine Addr _src; uint32_t _value; - Tick _entrance; public: EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value, Tick entrance): - _start(start), _end(end), _step(step), _atom(atom), - _src(src), _value(value), _entrance(entrance) + size_t atom, Addr src, uint32_t value): + _start(start), _end(end), _step(step), + _atom(atom), _src(src), _value(value) {} std::tuple nextReadPacketInfo() @@ -108,8 +107,6 @@ class PushEngine : public BaseMemoryEngine Addr src() { return _src; } uint32_t value() { return _value; } - - Tick entrance() { return _entrance; } }; struct PushInfo { Addr src; @@ -126,7 +123,7 @@ class PushEngine : public BaseMemoryEngine int numPendingPulls; int 
edgePointerQueueSize; - std::deque edgePointerQueue; + std::deque> edgePointerQueue; std::unordered_map reqInfoMap; int onTheFlyMemReqs; From 6413163e6f818ddc442e58c9302004c34bff1933 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 15:54:40 -0700 Subject: [PATCH 191/287] Adding sssp and pr. --- src/accl/graph/sega/CoalesceEngine.py | 2 ++ src/accl/graph/sega/PushEngine.py | 3 ++ src/accl/graph/sega/coalesce_engine.cc | 29 ++++++++++--------- src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/push_engine.cc | 40 ++++++++++++++++++++++---- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 8 +++++- 7 files changed, 63 insertions(+), 21 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index f6e997f1e3..eeba279b7a 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -44,3 +44,5 @@ class CoalesceEngine(BaseMemoryEngine): "requestor in each cycle. 
Used to limit b/w.") workload = Param.String("BFS", "Name of the workload") + + thereshold = Param.Float('0.0001', "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 5e0d2b3212..52dc0e2506 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -51,3 +51,6 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") + + alpha = Param.Float(0.8, "This parameter is specific to pagerank") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index f4cd6a950d..91072a1da8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), workload(params.workload), + _workCount(0), numPullsReceived(0), + workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -110,16 +111,20 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -uint32_t -CoalesceEngine::reduce(uint32_t update, uint32_t value) +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - uint32_t new_value; if(workload == "BFS"){ - new_value = std::min(update, value); + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); } else{ - panic("Workload not implemented\n"); + panic("The workload is not recognize"); } - return new_value; } // addr should be aligned to peerMemoryAtomSize @@ -639,7 +644,8 @@ 
CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (cacheBlocks[block_index].items[wl_offset].tempProp != wl.tempProp) { + if (applyCondition( + wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].needsApply |= true; // NOTE: We don't set needsWB and rely on processNextApplyEvent to @@ -747,12 +753,7 @@ CoalesceEngine::processNextApplyEvent() assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - // NOTE: It might be the case that for workloads other than BFS, - // the reduce function here should be different to the reduce - // function defined in WLEngine. Think about the case of PR in - // detail. - uint32_t new_prop = reduce( - cacheBlocks[block_index].items[index].tempProp, current_prop); + uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; if (new_prop != current_prop) { cacheBlocks[block_index].items[index].tempProp = new_prop; cacheBlocks[block_index].items[index].prop = new_prop; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b1f5b1fea1..a087f37b4d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -131,6 +131,7 @@ class CoalesceEngine : public BaseMemoryEngine std::string workload; uint32_t reduce(uint32_t update, uint32_t value); + bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index cd795eaf00..c9efa03f08 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,6 +158,10 @@ PushEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; 
if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + new_value = update + value; + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ panic("Workload not implemented\n"); } @@ -165,19 +169,42 @@ PushEngine::reduce(uint32_t update, uint32_t value) } uint32_t -PushEngine::propagate(uint32_t value, uint32_t weight) +PushEngine::propagate(uint32_t delta, uint32_t weight) { std::string workload = params().workload; uint32_t update; if (workload == "BFS") { - update = value + 1; - } - else{ + update = delta + 1; + } else if (workload == "SSSP") { + update = delta + weight; + } else if (workload == "PR") { + float float_form = writeToFloat(delta); + float float_update = float_form * weight * params().alpha; + update = readFromFloat(float_update); + } else{ panic("The workload %s is not supported", workload); } return update; } +uint32_t +PushEngine::calculateValue(WorkListItem wl) +{ + std::string workload = params().workload; + uint32_t delta; + if (workload == "PR") { + float property = writeToFloat(wl.prop) / wl.degree; + delta = readFromFloat(property); + } else if (workload == "BFS") { + delta = wl.prop; + } else if (workload == "SSSP") { + delta = wl.prop; + } else { + panic("Workload not supported."); + } + return delta; +} + void PushEngine::start() { @@ -220,9 +247,11 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, (uint32_t) wl.prop); + peerMemoryAtomSize, addr, value); edgePointerQueue.emplace_back(info_gen, curTick()); + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -256,7 +285,6 @@ PushEngine::processNextMemoryReadEvent() PacketPtr pkt = 
createReadPacket(aligned_addr, peerMemoryAtomSize); PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; - memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index acf012b24d..c03e78851c 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,6 +133,7 @@ class PushEngine : public BaseMemoryEngine uint32_t reduce(uint32_t update, uint32_t value); uint32_t propagate(uint32_t value, uint32_t weight); + uint32_t calculateValue(WorkListItem wl); int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index eb2006a3df..f684650f23 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -152,8 +152,14 @@ WLEngine::reduce(uint32_t update, uint32_t value) uint32_t new_value; if(workload == "BFS"){ new_value = std::min(update, value); + } else if(workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + new_value = readFromFloat(float_update + float_value); + } else if(workload == "SSSP"){ + new_value = std::min(update, value); } else{ - panic("Workload not implemented\n"); + panic("Workload not implemented."); } return new_value; } From bdb42750389d6e308a726f2d100bb5757895e034 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 11 Oct 2022 21:23:27 -0700 Subject: [PATCH 192/287] making workload appropriate inits --- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 17 +++++--- src/accl/graph/sega/centeral_controller.hh | 1 + src/accl/graph/sega/coalesce_engine.cc | 51 +++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 2 +- 5 files changed, 50 insertions(+), 22 deletions(-) diff --git a/src/accl/graph/sega/CenteralController.py 
b/src/accl/graph/sega/CenteralController.py index 2ba53c231f..ebc8281641 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,5 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 82e63d512e..9231f96379 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -71,11 +71,8 @@ CenteralController::initState() [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(pkt->getAddr())) { - mpu->recvFunctional(pkt); - break; - } + if (contains(range_list, pkt->getAddr())) { + mpu->recvFunctional(pkt); } } }, system->cacheLineSize()); @@ -139,6 +136,16 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createInitialPRUpdate() +{ + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } + } +} + void CenteralController::recvDoneSignal() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index d006851e3b..5b0f5d6816 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -64,6 +64,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createInitialPRUpdate(); void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 91072a1da8..92ad346b30 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ 
b/src/accl/graph/sega/coalesce_engine.cc @@ -75,6 +75,40 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } +void +CoalesceEngine::algoInit(PacketPtr pkt) +{ + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { + //TODO: Add Alpha + int bit_index_base = getBitIndexBase(pkt->getAddr()); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(1 - 0.2); + items[i].prop = readFromFloat(1 - 0.2); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); +} + +bool +CoalesceEngine::applyCondition(uint32_t update, uint32_t value) +{ + if(workload == "BFS"){ + return update != value; + } else if (workload == "SSSP"){ + return update < value; + } else if (workload == "PR"){ + float float_value = writeToFloat(value); + float float_update = writeToFloat(update); + return params().thereshold <= abs(float_update - float_value); + } else{ + panic("The workload is not recognize"); + } +} + void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -100,6 +134,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { + algoInit(pkt); memPort.sendFunctional(pkt); } } @@ -111,22 +146,6 @@ CoalesceEngine::done() memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS"){ - return update != value; - } else if (workload == "SSSP"){ - return update < value; - } else if (workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ - panic("The workload is not recognize"); - } -} - // addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBlockIndex(Addr addr) diff --git a/src/accl/graph/sega/coalesce_engine.hh 
b/src/accl/graph/sega/coalesce_engine.hh index a087f37b4d..49ee441ed3 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -130,7 +130,7 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); + void algoInit(PacketPtr pkt); bool applyCondition(uint32_t update, uint32_t value); MemoryEvent nextMemoryEvent; From 5fa0c4c2376706e694afa3babbe2353baafd7440 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Oct 2022 14:41:19 -0700 Subject: [PATCH 193/287] wip for implementing prewB and prePush apply functions. --- src/accl/graph/sega/CoalesceEngine.py | 7 ++- src/accl/graph/sega/WLEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 61 +++++++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 4 ++ src/accl/graph/sega/mpu.hh | 2 + src/accl/graph/sega/push_engine.hh | 2 + 6 files changed, 59 insertions(+), 19 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index eeba279b7a..a50a814e89 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,6 +43,11 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") + post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + workload = Param.String("BFS", "Name of the workload") - thereshold = Param.Float('0.0001', "Score threshold for Pagerank") + threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 91325ab53f..7fe392cc9e 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -45,4 +45,4 @@ class WLEngine(BaseReduceEngine): "many updates as this queueu has " "entries at the same time.") - workload = Param.String('BFS',"Name of the workload") \ No newline at end of file + workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 92ad346b30..4e1fe79899 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,7 +48,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), + _workCount(0), numPullsReceived(0), + postApplyWBQueueSize(params.post_apply_wb_queue_size), workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -67,6 +68,16 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); + + // TODO: Get rid of these booleans. 
+ // applyBeforeWB = true; + // if (workload == "PR") { + // applyBeforeWB = false; + // } + // applyBeforePush = false; + // if (workload == "PR") { + // applyBeforePush = true; + // } } void @@ -84,7 +95,7 @@ CoalesceEngine::algoInit(PacketPtr pkt) //TODO: Add Alpha int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(1 - 0.2); + items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - 0.2); needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); @@ -96,15 +107,15 @@ CoalesceEngine::algoInit(PacketPtr pkt) bool CoalesceEngine::applyCondition(uint32_t update, uint32_t value) { - if(workload == "BFS"){ + if(workload == "BFS") { return update != value; - } else if (workload == "SSSP"){ + } else if (workload == "SSSP") { return update < value; - } else if (workload == "PR"){ + } else if (workload == "PR") { float float_value = writeToFloat(value); float float_update = writeToFloat(update); - return params().thereshold <= abs(float_update - float_value); - } else{ + return params().threshold <= abs(float_update - float_value); + } else { panic("The workload is not recognize"); } } @@ -663,14 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); - if (applyCondition( - wl.tempProp, cacheBlocks[block_index].items[wl_offset].tempProp)) { - cacheBlocks[block_index].items[wl_offset] = wl; - cacheBlocks[block_index].needsApply |= true; - // NOTE: We don't set needsWB and rely on processNextApplyEvent to - // set that bit. 
+ if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } + if (applyCondition(wl.tempProp, + cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].needsApply |= true; + } + cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -773,10 +785,13 @@ CoalesceEngine::processNextApplyEvent() for (int index = 0; index < numElementsPerLine; index++) { uint32_t current_prop = cacheBlocks[block_index].items[index].prop; uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (new_prop != current_prop) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - + if (applyCondition(new_prop, current_prop)) { + if (applyBeforeWB) { + cacheBlocks[block_index].items[index].tempProp = new_prop; + cacheBlocks[block_index].items[index].prop = new_prop; + } + // TODO: Implement this function + // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); @@ -1046,6 +1061,18 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; + + // TODO: Implement a function like this. + // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); + // TODO: After implementing the above function get rid of this bool + // if (applyBeforePush) { + // cacheBlocks[block_index].items[wl_offset].prop = + // cacheBlocks[block_index].items[wl_offset].tempProp; + // } + // TODO: Implement recvVertexPush2 in PushEngine. 
+ // owner->recvVertexPush2(vertex_addr, delta, + // cacheBlocks[block_index].items[wl_offset].edgeIndex, + // cacheBlocks[block_index].items[wl_offset].degree); owner->recvVertexPush( vertex_addr, cacheBlocks[block_index].items[wl_offset]); stats.verticesPushed++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 49ee441ed3..c9564ac187 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -114,11 +114,15 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + bool applyBeforeWB; + bool applyBeforePush; int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; + int postApplyWBQueueSize; + std::deque postApplyWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 229bd28950..9dcb9de5d7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,6 +75,8 @@ class MPU : public SimObject bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index c03e78851c..ec0dd09e43 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,8 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } void recvVertexPush(Addr addr, WorkListItem wl); + void recvVertexPush2(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 2e1719a6537238b64337472dd0b5b741b07bc0c3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 14 Oct 2022 16:24:09 -0700 Subject: [PATCH 194/287] 
Adding GraphWorkload class. --- configs/accl/sega-hbm.py | 7 +- src/accl/graph/base/SConscript | 1 + src/accl/graph/base/data_structs.hh | 3 +- src/accl/graph/base/graph_workload.cc | 66 ++++++++++++ src/accl/graph/base/graph_workload.hh | 74 +++++++++++++ src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 ++ src/accl/graph/sega/centeral_controller.hh | 4 + src/accl/graph/sega/coalesce_engine.cc | 76 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 9 +- src/accl/graph/sega/mpu.cc | 8 ++ src/accl/graph/sega/mpu.hh | 1 + src/accl/graph/sega/push_engine.cc | 115 +++++++++++---------- src/accl/graph/sega/push_engine.hh | 5 +- src/accl/graph/sega/wl_engine.cc | 39 ++++--- src/accl/graph/sega/wl_engine.hh | 5 +- 16 files changed, 302 insertions(+), 122 deletions(-) create mode 100644 src/accl/graph/base/graph_workload.cc create mode 100644 src/accl/graph/base/graph_workload.hh diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index cdc752f2bd..50fd5f3069 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -56,7 +56,8 @@ def __init__(self, edge_memory_size: str, cache_size: str): cache_size=cache_size, num_mshr_entry=64, num_tgts_per_mshr=64, - max_resp_per_cycle=8 + max_resp_per_cycle=8, + post_apply_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -135,6 +136,9 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -166,6 +170,7 @@ def get_inputs(): m5.instantiate() system.create_initial_bfs_update(init_addr, init_value) + system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because 
{exit_event.getCause()}") diff --git a/src/accl/graph/base/SConscript b/src/accl/graph/base/SConscript index 8b741abfc8..35111c34d2 100644 --- a/src/accl/graph/base/SConscript +++ b/src/accl/graph/base/SConscript @@ -30,3 +30,4 @@ Import("*") SimObject("BaseReduceEngine.py", sim_objects=["BaseReduceEngine"]) Source("base_reduce_engine.cc") +Source("graph_workload.cc") diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 3753e10d62..2d81375b63 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -32,9 +32,10 @@ #include "base/cprintf.hh" #include "base/intmath.hh" -#include +#include #include #include +#include namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc new file mode 100644 index 0000000000..3d0d45b1de --- /dev/null +++ b/src/accl/graph/base/graph_workload.cc @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/base/graph_workload.hh" + +namespace gem5 +{ + +uint32_t +BFSWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +BFSWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + 1; +} + +bool +BFSWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp < wl.prop; +} + +bool +BFSWorkload::preWBApply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.degree > 0; +} + +std::tuple +BFSWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t value = wl.prop; + return std::make_tuple(value, false); +} + +} // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh new file mode 100644 index 0000000000..304b434a3d --- /dev/null +++ b/src/accl/graph/base/graph_workload.hh @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" + + +namespace gem5 +{ + +class GraphWorkload +{ + public: + GraphWorkload() {} + ~GraphWorkload() {} + virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; + virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; + virtual bool applyCondition(WorkListItem wl) = 0; + virtual bool preWBApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; +}; + +class BFSWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + public: + BFSWorkload(uint64_t init_addr, uint32_t init_value): + GraphWorkload(), + initAddr(init_addr), initValue(init_value) + {} + + ~BFSWorkload() {} + + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + +} + +#endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index ebc8281641..17badf9ec4 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,7 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createInitialBFSUpdate"), + PyBindMethod("createBFSWorkload"), PyBindMethod("createInitialPRUpdate"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 9231f96379..2074f69f08 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -83,6 +83,10 @@ CenteralController::initState() void CenteralController::startup() { + for (auto mpu: mpuVector) { + mpu->recvWorkload(workload); + } + 
while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { @@ -136,6 +140,12 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) initialUpdates.push_back(update); } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + void CenteralController::createInitialPRUpdate() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 5b0f5d6816..1f1df00b4b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -32,6 +32,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "debug/FinalAnswer.hh" @@ -47,6 +48,8 @@ class CenteralController : public ClockedObject private: System* system; + GraphWorkload* workload; + Addr maxVertexAddr; std::deque initialUpdates; @@ -64,6 +67,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createInitialPRUpdate(); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4e1fe79899..20bfaf8481 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -68,16 +68,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): cacheBlocks[i] = Block(numElementsPerLine); } needsPush.reset(); - - // TODO: Get rid of these booleans. 
- // applyBeforeWB = true; - // if (workload == "PR") { - // applyBeforeWB = false; - // } - // applyBeforePush = false; - // if (workload == "PR") { - // applyBeforePush = true; - // } } void @@ -90,9 +80,10 @@ void CoalesceEngine::algoInit(PacketPtr pkt) { WorkListItem items[numElementsPerLine]; - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + if(workload == "PR") { //TODO: Add Alpha + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); int bit_index_base = getBitIndexBase(pkt->getAddr()); for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); @@ -100,25 +91,39 @@ CoalesceEngine::algoInit(PacketPtr pkt) needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); } + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + } -bool -CoalesceEngine::applyCondition(uint32_t update, uint32_t value) -{ - if(workload == "BFS") { - return update != value; - } else if (workload == "SSSP") { - return update < value; - } else if (workload == "PR") { - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - return params().threshold <= abs(float_update - float_value); - } else { - panic("The workload is not recognize"); - } -} +// bool +// CoalesceEngine::applyCondition(WorkListItem wl) +// { +// if (workload == "BFS") { +// return wl.tempProp != wl.prop; +// } else if (workload == "SSSP") { +// return wl.tempProp < wl.prop; +// } else if (workload == "PR") { +// float float_temp = writeToFloat(wl.tempProp); +// float float_prop = writeToFloat(wl.prop); +// return params().threshold <= abs(float_prop - float_temp); +// } else { +// panic("The workload is not recognized."); +// } +// } + +// bool +// CoalesceEngine::preWBApply(WorkListItem& wl) +// { +// if (workload == "BFS") { +// uint32_t new_prop = std::min(wl.tempProp, wl.prop); +// wl.tempProp = new_prop; +// wl.prop = new_prop; +// return 
wl.degree > 0; +// } else { +// panic("The workload is not recognized."); +// } +// } void CoalesceEngine::recvFunctional(PacketPtr pkt) @@ -678,11 +683,10 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].needsWB |= true; stats.numVertexWrites++; } - if (applyCondition(wl.tempProp, - cacheBlocks[block_index].items[wl_offset].prop)) { + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; } - cacheBlocks[block_index].items[wl_offset] = wl; cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); @@ -783,19 +787,13 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - uint32_t current_prop = cacheBlocks[block_index].items[index].prop; - uint32_t new_prop = cacheBlocks[block_index].items[index].tempProp; - if (applyCondition(new_prop, current_prop)) { - if (applyBeforeWB) { - cacheBlocks[block_index].items[index].tempProp = new_prop; - cacheBlocks[block_index].items[index].prop = new_prop; - } + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { // TODO: Implement this function - // bool do_push = preWBApply(cacheBlocks[block_index].items[index]); + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (cacheBlocks[block_index].items[index].degree > 0) { + if (do_push) { if (needsPush[bit_index_base + index] == 0) { _workCount++; needsPush[bit_index_base + index] = 1; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9564ac187..3492cab9dc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -31,8 +31,9 
@@ #include -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" @@ -134,8 +135,11 @@ class CoalesceEngine : public BaseMemoryEngine std::unordered_map pendingVertexPullReads; std::string workload; + GraphWorkload* graphWorkload; + void algoInit(PacketPtr pkt); - bool applyCondition(uint32_t update, uint32_t value); + bool applyCondition(WorkListItem wl); + bool preWBApply(WorkListItem& wl); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -203,6 +207,7 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); bool recvWLRead(Addr addr); diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 44054d1efb..70f1e05f32 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -71,6 +71,14 @@ MPU::recvWLWrite(Addr addr, WorkListItem wl) coalesceEngine->recvWLWrite(addr, wl); } +void +MPU::recvWorkload(GraphWorkload* workload) +{ + coalesceEngine->recvWorkload(workload); + pushEngine->recvWorkload(workload); + wlEngine->recvWorkload(workload); +} + void MPU::recvVertexPush(Addr addr, WorkListItem wl) { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 9dcb9de5d7..8f6101c325 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -69,6 +69,7 @@ class MPU : public SimObject void handleIncomingWL(Addr addr, WorkListItem wl); bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); + void recvWorkload(GraphWorkload* Workload); int workCount() { return coalesceEngine->workCount(); } void recvVertexPull() { return 
coalesceEngine->recvVertexPull(); } diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c9efa03f08..a661a755b7 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,59 +151,59 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -uint32_t -PushEngine::reduce(uint32_t update, uint32_t value) -{ - std::string workload = params().workload; - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - new_value = update + value; - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented\n"); - } - return new_value; -} - -uint32_t -PushEngine::propagate(uint32_t delta, uint32_t weight) -{ - std::string workload = params().workload; - uint32_t update; - if (workload == "BFS") { - update = delta + 1; - } else if (workload == "SSSP") { - update = delta + weight; - } else if (workload == "PR") { - float float_form = writeToFloat(delta); - float float_update = float_form * weight * params().alpha; - update = readFromFloat(float_update); - } else{ - panic("The workload %s is not supported", workload); - } - return update; -} - -uint32_t -PushEngine::calculateValue(WorkListItem wl) -{ - std::string workload = params().workload; - uint32_t delta; - if (workload == "PR") { - float property = writeToFloat(wl.prop) / wl.degree; - delta = readFromFloat(property); - } else if (workload == "BFS") { - delta = wl.prop; - } else if (workload == "SSSP") { - delta = wl.prop; - } else { - panic("Workload not supported."); - } - return delta; -} +// uint32_t +// PushEngine::reduce(uint32_t update, uint32_t value) +// { +// std::string workload = params().workload; +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// new_value = update + value; +// } else if(workload == "SSSP"){ +// new_value = 
std::min(update, value); +// } else{ +// panic("Workload not implemented\n"); +// } +// return new_value; +// } + +// uint32_t +// PushEngine::propagate(uint32_t delta, uint32_t weight) +// { +// std::string workload = params().workload; +// uint32_t update; +// if (workload == "BFS") { +// update = delta + 1; +// } else if (workload == "SSSP") { +// update = delta + weight; +// } else if (workload == "PR") { +// float float_form = writeToFloat(delta); +// float float_update = float_form * weight * params().alpha; +// update = readFromFloat(float_update); +// } else{ +// panic("The workload %s is not supported", workload); +// } +// return update; +// } + +// uint32_t +// PushEngine::calculateValue(WorkListItem wl) +// { +// std::string workload = params().workload; +// uint32_t delta; +// if (workload == "PR") { +// float property = writeToFloat(wl.prop) / wl.degree; +// delta = readFromFloat(property); +// } else if (workload == "BFS") { +// delta = wl.prop; +// } else if (workload == "SSSP") { +// delta = wl.prop; +// } else { +// panic("Workload not supported."); +// } +// return delta; +// } void PushEngine::start() @@ -247,9 +247,9 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) Addr start_addr = wl.edgeIndex * sizeof(Edge); Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); - uint32_t value = calculateValue(wl); + // uint32_t value = calculateValue(wl); EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, value); + peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -364,7 +364,8 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = propagate(meta_edge.value, meta_edge.weight); + uint32_t update_value = + graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); 
metaEdgeQueue.pop_front(); @@ -419,7 +420,7 @@ PushEngine::enqueueUpdate(Update update) Update& curr_update = std::get<0>(entry); if (curr_update.dst == update.dst) { uint32_t old_value = curr_update.value; - curr_update.value = reduce(old_value, update.value); + curr_update.value = graphWorkload->reduce(old_value, update.value); DPRINTF(PushEngine, "%s: found a coalescing opportunity " "for destination %d with new value: %d by " "coalescing %d and %d. \n", __func__, update.dst, diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index ec0dd09e43..47db96d818 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,8 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ -#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -115,6 +116,7 @@ class PushEngine : public BaseMemoryEngine int numElements; }; MPU* owner; + GraphWorkload* graphWorkload; bool _running; Tick lastIdleEntranceTick; @@ -194,6 +196,7 @@ class PushEngine : public BaseMemoryEngine virtual void init() override; void registerMPU(MPU* mpu); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt) { memPort.sendFunctional(pkt); } void start(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index f684650f23..86acd40b69 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -41,7 +41,6 @@ WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), registerFileSize(params.register_file_size), - workload(params.workload), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ 
processNextReduceEvent(); }, name()), stats(*this) @@ -146,23 +145,23 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -uint32_t -WLEngine::reduce(uint32_t update, uint32_t value) -{ - uint32_t new_value; - if(workload == "BFS"){ - new_value = std::min(update, value); - } else if(workload == "PR"){ - float float_value = writeToFloat(value); - float float_update = writeToFloat(update); - new_value = readFromFloat(float_update + float_value); - } else if(workload == "SSSP"){ - new_value = std::min(update, value); - } else{ - panic("Workload not implemented."); - } - return new_value; -} +// uint32_t +// WLEngine::reduce(uint32_t update, uint32_t value) +// { +// uint32_t new_value; +// if(workload == "BFS"){ +// new_value = std::min(update, value); +// } else if(workload == "PR"){ +// float float_value = writeToFloat(value); +// float float_update = writeToFloat(update); +// new_value = readFromFloat(float_update + float_value); +// } else if(workload == "SSSP"){ +// new_value = std::min(update, value); +// } else{ +// panic("Workload not implemented."); +// } +// return new_value; +// } bool WLEngine::handleIncomingUpdate(PacketPtr pkt) @@ -251,7 +250,7 @@ WLEngine::processNextReadEvent() "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); registerFile[update_addr] = - reduce(update_value, registerFile[update_addr]); + graphWorkload->reduce(update_value, registerFile[update_addr]); DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. 
registerFile[%lu] = %u.\n", __func__, update_value, update_addr, registerFile[update_addr]); @@ -310,7 +309,7 @@ WLEngine::processNextReduceEvent() addr, workListFile[addr].to_string()); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = - reduce(update_value, workListFile[addr].tempProp); + graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", __func__, addr, workListFile[addr].to_string()); stats.numReduce++; diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 7578044cbf..0d0e532269 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -33,6 +33,7 @@ #include #include "accl/graph/base/base_reduce_engine.hh" +#include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" @@ -70,7 +71,8 @@ class WLEngine : public BaseReduceEngine }; MPU* owner; - + GraphWorkload* graphWorkload; + std::vector inPorts; int updateQueueSize; @@ -118,6 +120,7 @@ class WLEngine : public BaseReduceEngine void registerMPU(MPU* mpu); AddrRangeList getAddrRanges(); + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } void recvFunctional(PacketPtr pkt); bool handleIncomingUpdate(PacketPtr pkt); From fba3e575719072c9dec328df5c6f0603bb9d7c6f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 15 Oct 2022 16:59:05 -0700 Subject: [PATCH 195/287] Cleaning up. 
--- src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/PushEngine.py | 7 +-- src/accl/graph/sega/WLEngine.py | 2 - src/accl/graph/sega/centeral_controller.cc | 5 +- src/accl/graph/sega/coalesce_engine.cc | 64 ++++++---------------- src/accl/graph/sega/coalesce_engine.hh | 8 +-- src/accl/graph/sega/push_engine.cc | 58 +------------------- src/accl/graph/sega/push_engine.hh | 4 -- src/accl/graph/sega/wl_engine.cc | 18 ------ src/accl/graph/sega/wl_engine.hh | 6 +- 10 files changed, 23 insertions(+), 152 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a50a814e89..d462d618e6 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -48,6 +48,3 @@ class CoalesceEngine(BaseMemoryEngine): "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - workload = Param.String("BFS", "Name of the workload") - - threshold = Param.Float(0.0001, "Score threshold for Pagerank") diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 52dc0e2506..20c5452d43 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -34,8 +34,6 @@ class PushEngine(BaseMemoryEngine): cxx_header = "accl/graph/sega/push_engine.hh" cxx_class = 'gem5::PushEngine' - workload = Param.String("BFS", "Name of the workload.") - push_req_queue_size = Param.Int("Size of the queue to " "queue push requests.") # resp_queue_size should probably be @@ -43,7 +41,7 @@ class PushEngine(BaseMemoryEngine): resp_queue_size = Param.Int("Size of the response queue in the " "push engine where it stores the " "edges read from memory.") - + max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " "done per cycle.") @@ -51,6 +49,3 @@ class PushEngine(BaseMemoryEngine): "for each update queue.") out_ports = VectorRequestPort("Outgoing ports to all MPUs") - - alpha = Param.Float(0.8, "This parameter is 
specific to pagerank") - diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 7fe392cc9e..5a8ed9c9fd 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -44,5 +44,3 @@ class WLEngine(BaseReduceEngine): "WLEngine has. It can service as " "many updates as this queueu has " "entries at the same time.") - - workload = Param.String("BFS","Name of the workload") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 2074f69f08..fd282834e9 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -55,6 +55,7 @@ CenteralController::initState() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->recvWorkload(workload); } const auto& file = params().image_file; if (file == "") @@ -83,10 +84,6 @@ CenteralController::initState() void CenteralController::startup() { - for (auto mpu: mpuVector) { - mpu->recvWorkload(workload); - } - while(!initialUpdates.empty()) { PacketPtr front = initialUpdates.front(); for (auto mpu: mpuVector) { diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 20bfaf8481..fa5099353e 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -50,7 +50,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postApplyWBQueueSize(params.post_apply_wb_queue_size), - workload(params.workload), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -76,52 +75,22 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -void -CoalesceEngine::algoInit(PacketPtr pkt) -{ - WorkListItem items[numElementsPerLine]; - - if(workload == "PR") { - //TODO: Add Alpha - pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - int bit_index_base = getBitIndexBase(pkt->getAddr()); 
- for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - 0.2); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - } - pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - } - -} - -// bool -// CoalesceEngine::applyCondition(WorkListItem wl) -// { -// if (workload == "BFS") { -// return wl.tempProp != wl.prop; -// } else if (workload == "SSSP") { -// return wl.tempProp < wl.prop; -// } else if (workload == "PR") { -// float float_temp = writeToFloat(wl.tempProp); -// float float_prop = writeToFloat(wl.prop); -// return params().threshold <= abs(float_prop - float_temp); -// } else { -// panic("The workload is not recognized."); -// } -// } - -// bool -// CoalesceEngine::preWBApply(WorkListItem& wl) +// void +// CoalesceEngine::algoInit(PacketPtr pkt) // { -// if (workload == "BFS") { -// uint32_t new_prop = std::min(wl.tempProp, wl.prop); -// wl.tempProp = new_prop; -// wl.prop = new_prop; -// return wl.degree > 0; -// } else { -// panic("The workload is not recognized."); +// WorkListItem items[numElementsPerLine]; + +// if(workload == "PR") { +// //TODO: Add Alpha +// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); +// int bit_index_base = getBitIndexBase(pkt->getAddr()); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - 0.2); +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// } +// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); // } // } @@ -150,7 +119,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - algoInit(pkt); + // TODO: Add and implement init function for GraphWorkload. 
+ // graphWorkload->init(pkt); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 3492cab9dc..0a2c0ca5ff 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -103,6 +103,7 @@ class CoalesceEngine : public BaseMemoryEngine SenderState(bool is_retry): isRetry(is_retry) {} }; MPU* owner; + GraphWorkload* graphWorkload; int numLines; int numElementsPerLine; @@ -134,13 +135,6 @@ class CoalesceEngine : public BaseMemoryEngine // send for push when getting the read response from memory. std::unordered_map pendingVertexPullReads; - std::string workload; - GraphWorkload* graphWorkload; - - void algoInit(PacketPtr pkt); - bool applyCondition(WorkListItem wl); - bool preWBApply(WorkListItem& wl); - MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a661a755b7..c54f19307f 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -151,60 +151,6 @@ PushEngine::done() (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); } -// uint32_t -// PushEngine::reduce(uint32_t update, uint32_t value) -// { -// std::string workload = params().workload; -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// new_value = update + value; -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented\n"); -// } -// return new_value; -// } - -// uint32_t -// PushEngine::propagate(uint32_t delta, uint32_t weight) -// { -// std::string workload = params().workload; -// uint32_t update; -// if (workload == "BFS") { -// update = delta + 1; -// } else if (workload == "SSSP") { -// update = delta + weight; -// } else if (workload == "PR") { -// float 
float_form = writeToFloat(delta); -// float float_update = float_form * weight * params().alpha; -// update = readFromFloat(float_update); -// } else{ -// panic("The workload %s is not supported", workload); -// } -// return update; -// } - -// uint32_t -// PushEngine::calculateValue(WorkListItem wl) -// { -// std::string workload = params().workload; -// uint32_t delta; -// if (workload == "PR") { -// float property = writeToFloat(wl.prop) / wl.degree; -// delta = readFromFloat(property); -// } else if (workload == "BFS") { -// delta = wl.prop; -// } else if (workload == "SSSP") { -// delta = wl.prop; -// } else { -// panic("Workload not supported."); -// } -// return delta; -// } - void PushEngine::start() { @@ -251,7 +197,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); - + numPendingPulls--; if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); @@ -364,7 +310,7 @@ PushEngine::processNextPropagateEvent() DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); - uint32_t update_value = + uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); Update update(meta_edge.src, meta_edge.dst, update_value); metaEdgeQueue.pop_front(); diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 47db96d818..1112176897 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -133,10 +133,6 @@ class PushEngine : public BaseMemoryEngine int maxPropagatesPerCycle; std::deque> metaEdgeQueue; - uint32_t reduce(uint32_t update, uint32_t value); - uint32_t propagate(uint32_t value, uint32_t weight); - uint32_t calculateValue(WorkListItem wl); - int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); bool 
enqueueUpdate(Update update); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 86acd40b69..85fe9be2ca 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -145,24 +145,6 @@ WLEngine::done() return registerFile.empty() && updateQueue.empty(); } -// uint32_t -// WLEngine::reduce(uint32_t update, uint32_t value) -// { -// uint32_t new_value; -// if(workload == "BFS"){ -// new_value = std::min(update, value); -// } else if(workload == "PR"){ -// float float_value = writeToFloat(value); -// float float_update = writeToFloat(update); -// new_value = readFromFloat(float_update + float_value); -// } else if(workload == "SSSP"){ -// new_value = std::min(update, value); -// } else{ -// panic("Workload not implemented."); -// } -// return new_value; -// } - bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 0d0e532269..f442d6060e 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -72,7 +72,7 @@ class WLEngine : public BaseReduceEngine MPU* owner; GraphWorkload* graphWorkload; - + std::vector inPorts; int updateQueueSize; @@ -81,12 +81,8 @@ class WLEngine : public BaseReduceEngine int registerFileSize; std::unordered_map registerFile; std::unordered_map vertexReadTime; - std::unordered_map workListFile; - std::string workload; - uint32_t reduce(uint32_t update, uint32_t value); - EventFunctionWrapper nextReadEvent; void processNextReadEvent(); From 01ab8f8809451179d27f3f5da7be57675161f4e7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 16 Oct 2022 17:05:07 -0700 Subject: [PATCH 196/287] Implementing post push wb buffer. 
--- src/accl/graph/base/graph_workload.cc | 19 +- src/accl/graph/base/graph_workload.hh | 6 +- src/accl/graph/sega/CoalesceEngine.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 239 +++++++++++++++++-------- src/accl/graph/sega/coalesce_engine.hh | 10 +- src/accl/graph/sega/mpu.cc | 12 +- src/accl/graph/sega/mpu.hh | 4 +- src/accl/graph/sega/push_engine.cc | 17 +- src/accl/graph/sega/push_engine.hh | 23 ++- 9 files changed, 223 insertions(+), 109 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3d0d45b1de..6a8e000515 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,10 +28,10 @@ #include "accl/graph/base/graph_workload.hh" -namespace gem5 +namespace gem5 { -uint32_t +uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { return std::min(update, value); @@ -43,7 +43,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) return value + 1; } -bool +bool BFSWorkload::applyCondition(WorkListItem wl) { return wl.tempProp < wl.prop; @@ -52,15 +52,20 @@ BFSWorkload::applyCondition(WorkListItem wl) bool BFSWorkload::preWBApply(WorkListItem& wl) { - wl.prop = wl.tempProp; - return wl.degree > 0; + if (applyCondition(wl)) { + wl.prop = wl.tempProp; + if (wl.degree > 0) { + return true; + } + } + return false; } -std::tuple +std::tuple BFSWorkload::prePushApply(WorkListItem& wl) { uint32_t value = wl.prop; - return std::make_tuple(value, false); + return std::make_tuple(value, true, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 304b434a3d..c4db5c9e2f 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -46,7 +46,7 @@ class GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple 
prePushApply(WorkListItem& wl) = 0; + virtual std::tuple prePushApply(WorkListItem& wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -56,7 +56,7 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; public: BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), + GraphWorkload(), initAddr(init_addr), initValue(init_value) {} @@ -66,7 +66,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); }; } diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index d462d618e6..1fd3b968c5 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -43,7 +43,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. 
Used to limit b/w.") - post_apply_wb_queue_size = Param.Int("Maximum number of pending wb after " + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fa5099353e..0c223a8a5b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -49,16 +49,17 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), - postApplyWBQueueSize(params.post_apply_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextApplyEvent([this] { - processNextApplyEvent(); - }, name() + ".nextApplyEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -469,7 +470,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) onTheFlyReqs--; Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + bool do_wb = false; if (pkt->findNextSenderState()) { assert(!((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid))); @@ -480,7 +483,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) "for addr %lu.\n", __func__, addr); int it = getBitIndexBase(addr); uint64_t send_mask = pendingVertexPullReads[addr]; - WorkListItem* items = pkt->getPtr(); // No applying of the line needed. 
for (int i = 0; i < numElementsPerLine; i++) { Addr vertex_addr = addr + i * sizeof(WorkListItem); @@ -489,19 +491,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(needsPush[it + i] == 1); needsPush[it + i] = 0; _workCount--; - owner->recvVertexPush(vertex_addr, items[i]); + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pendingVertexPullReads.erase(addr); - delete pkt; - return true; + maxPotentialPostPushWB--; } if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); + "fill cacheBlocks[%d].\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); assert(!cacheBlocks[block_index].valid); @@ -512,19 +525,30 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); assert(MSHR.find(block_index) != MSHR.end()); - pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, - peerMemoryAtomSize); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, + cacheBlocks[block_index].items[i].to_string()); } cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; 
cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - delete pkt; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { @@ -570,6 +594,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + delete pkt; return true; } @@ -675,8 +700,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " "applyQueue.\n", __func__, block_index); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } } else { assert(MSHR.size() <= numMSHREntries); @@ -742,7 +767,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } void -CoalesceEngine::processNextApplyEvent() +CoalesceEngine::processNextPreWBApplyEvent() { int block_index = applyQueue.front(); DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. 
" @@ -757,27 +782,22 @@ CoalesceEngine::processNextApplyEvent() if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); for (int index = 0; index < numElementsPerLine; index++) { - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[index])) { - // TODO: Implement this function - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - int bit_index_base = - getBitIndexBase(cacheBlocks[block_index].addr); - - if (do_push) { - if (needsPush[bit_index_base + index] == 0) { - _workCount++; - needsPush[bit_index_base + index] = 1; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + _workCount++; + needsPush[bit_index_base + index] = 1; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); } } } } stats.bitvectorLength.sample(needsPush.count()); - cacheBlocks[block_index].needsWB = true; + assert(cacheBlocks[block_index].needsWB); cacheBlocks[block_index].needsApply = false; cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].lastChangedTick = curTick(); @@ -810,8 +830,8 @@ CoalesceEngine::processNextApplyEvent() applyQueue.pop_front(); if ((!applyQueue.empty()) && - (!nextApplyEvent.scheduled())) { - schedule(nextApplyEvent, nextCycle()); + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); } if (done()) { @@ -870,16 +890,78 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].pendingData); assert(!cacheBlocks[block_index].pendingApply); assert(!cacheBlocks[block_index].pendingWB); - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + cacheBlocks[block_index].items[wl_offset].to_string(), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } } } @@ -948,6 +1030,18 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } } +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + std::tuple CoalesceEngine::getOptimalPullAddr() { @@ -1017,6 +1111,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) assert(vertex_send_mask == 0); send_mask |= (1 << index_offset); pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; } if (bit_status == BitStatus::IN_CACHE) { // renaming the outputs to their local names. 
@@ -1030,35 +1125,39 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) needsPush[slice_base_index + wl_offset] = 0; _workCount--; - // TODO: Implement a function like this. - // uint32_t delta, bool do_wb = prePushApply(cacheBlocks[block_index].items[wl_offset]); - // TODO: After implementing the above function get rid of this bool - // if (applyBeforePush) { - // cacheBlocks[block_index].items[wl_offset].prop = - // cacheBlocks[block_index].items[wl_offset].tempProp; - // } - // TODO: Implement recvVertexPush2 in PushEngine. - // owner->recvVertexPush2(vertex_addr, delta, - // cacheBlocks[block_index].items[wl_offset].edgeIndex, - // cacheBlocks[block_index].items[wl_offset].degree); - owner->recvVertexPush( - vertex_addr, cacheBlocks[block_index].items[wl_offset]); + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; } if (bit_status == BitStatus::IN_MEMORY) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - pendingVertexPullReads[addr] = send_mask; + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << 
index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } } - numPullsReceived--; } stats.bitvectorSearchStatus[bit_status]++; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 0a2c0ca5ff..c0091a494d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -123,14 +123,15 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO applyQueue; std::bitset needsPush; std::deque activeBits; - int postApplyWBQueueSize; - std::deque postApplyWBQueue; + int postPushWBQueueSize; + std::deque> postPushWBQueue; int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); std::tuple getOptimalPullAddr(); + int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. 
std::unordered_map pendingVertexPullReads; @@ -140,14 +141,15 @@ class CoalesceEngine : public BaseMemoryEngine void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextApplyEvent; - void processNextApplyEvent(); + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); struct CoalesceStats : public statistics::Group { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index 70f1e05f32..b91aa21a53 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/centeral_controller.hh" +#include "debug/MPU.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -80,9 +81,16 @@ MPU::recvWorkload(GraphWorkload* workload) } void -MPU::recvVertexPush(Addr addr, WorkListItem wl) +MPU::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - pushEngine->recvVertexPush(addr, wl); + pushEngine->recvVertexPush(addr, delta, edge_index, degree); +} + +void +MPU::recvPrevPullCorrection() +{ + DPRINTF(MPU, "%s: Fuck!\n", __func__); } void diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f6101c325..8f3b29f603 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -75,9 +75,9 @@ class MPU : public SimObject void recvVertexPull() { return coalesceEngine->recvVertexPull(); } bool running() { return pushEngine->running(); } void start() { return pushEngine->start(); } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t 
degree); + void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c54f19307f..c76567696e 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -184,18 +184,18 @@ PushEngine::processNextVertexPullEvent() } void -PushEngine::recvVertexPush(Addr addr, WorkListItem wl) +PushEngine::recvVertexPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) { - assert(wl.degree > 0); + assert(degree > 0); assert((edgePointerQueueSize == 0) || ((edgePointerQueue.size() + numPendingPulls) <= edgePointerQueueSize)); - Addr start_addr = wl.edgeIndex * sizeof(Edge); - Addr end_addr = start_addr + (wl.degree * sizeof(Edge)); + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); - // uint32_t value = calculateValue(wl); - EdgeReadInfoGen info_gen(start_addr, end_addr, sizeof(Edge), - peerMemoryAtomSize, addr, wl.prop); edgePointerQueue.emplace_back(info_gen, curTick()); numPendingPulls--; @@ -207,6 +207,7 @@ PushEngine::recvVertexPush(Addr addr, WorkListItem wl) (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } + } void @@ -229,7 +230,7 @@ PushEngine::processNextMemoryReadEvent() "num_edges: %d.\n", __func__, aligned_addr, offset, num_edges); PacketPtr pkt = createReadPacket(aligned_addr, peerMemoryAtomSize); - PushInfo push_info = {curr_info.src(), curr_info.value(), offset, num_edges}; + PushInfo push_info = {curr_info.src(), curr_info.delta(), offset, num_edges}; reqInfoMap[pkt->req] = push_info; memPort.sendPacket(pkt); onTheFlyMemReqs += num_edges; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 1112176897..848c93e313 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh 
@@ -66,21 +66,24 @@ class PushEngine : public BaseMemoryEngine class EdgeReadInfoGen { private: + Addr _src; + uint32_t _delta; + Addr _start; Addr _end; size_t _step; size_t _atom; - Addr _src; - uint32_t _value; - public: - EdgeReadInfoGen(Addr start, Addr end, size_t step, - size_t atom, Addr src, uint32_t value): - _start(start), _end(end), _step(step), - _atom(atom), _src(src), _value(value) + EdgeReadInfoGen(Addr src, uint32_t delta, Addr start, + Addr end, size_t step, size_t atom): + _src(src), _delta(delta), _start(start), + _end(end), _step(step), _atom(atom) {} + Addr src() { return _src; } + uint32_t delta() { return _delta; } + std::tuple nextReadPacketInfo() { panic_if(done(), "Should not call nextPacketInfo when done.\n"); @@ -105,9 +108,6 @@ class PushEngine : public BaseMemoryEngine } bool done() { return (_start >= _end); } - - Addr src() { return _src; } - uint32_t value() { return _value; } }; struct PushInfo { Addr src; @@ -197,8 +197,7 @@ class PushEngine : public BaseMemoryEngine void start(); bool running() { return _running; } - void recvVertexPush(Addr addr, WorkListItem wl); - void recvVertexPush2(Addr addr, uint32_t delta, + void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); void recvReqRetry(); From 932aec66eb6997d2be580eb711f299ee41d1559b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Oct 2022 08:40:47 -0700 Subject: [PATCH 197/287] Implementing correction function for PushEngine. 
--- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- src/accl/graph/sega/mpu.cc | 2 +- src/accl/graph/sega/push_engine.cc | 9 +++++++++ src/accl/graph/sega/push_engine.hh | 1 + 4 files changed, 14 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0c223a8a5b..441457f2e8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -501,9 +501,9 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) owner->recvVertexPush(vertex_addr, delta, items[i].edgeIndex, items[i].degree); } else { + // TODO: Add a stat to count this. owner->recvPrevPullCorrection(); } - stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -548,7 +548,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextMemoryEvent, nextCycle()); } } else { - DPRINTF(CoalesceEngine, "%s: Fuck 2.\n", __func__); + // TODO: Add a stat to count this. + DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b91aa21a53..b30060238d 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -90,7 +90,7 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, void MPU::recvPrevPullCorrection() { - DPRINTF(MPU, "%s: Fuck!\n", __func__); + pushEngine->recvPrevPullCorrection(); } void diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index c76567696e..07f37a28dc 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -207,7 +207,16 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, (!nextMemoryReadEvent.scheduled())) { schedule(nextMemoryReadEvent, nextCycle()); } +} +void +PushEngine::recvPrevPullCorrection() +{ + assert(numPendingPulls > 0); + numPendingPulls--; + if (workLeft() && vertexSpace() && 
(!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } } void diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 848c93e313..2e1de25390 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -199,6 +199,7 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvPrevPullCorrection(); void recvReqRetry(); From 60ea8db3c1de4536d384c9b03e782db5739bf7b9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Wed, 19 Oct 2022 08:03:16 -0700 Subject: [PATCH 198/287] Adding initialization to graphWorkloads --- configs/accl/sega-hbm.py | 4 +- src/accl/graph/base/data_structs.hh | 2 + src/accl/graph/base/graph_workload.cc | 72 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 44 +++++++++++-- src/accl/graph/sega/centeral_controller.cc | 22 ++----- src/accl/graph/sega/centeral_controller.hh | 7 ++- src/accl/graph/sega/coalesce_engine.cc | 3 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 8 files changed, 128 insertions(+), 28 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 50fd5f3069..9078c185f3 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -57,7 +57,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): num_mshr_entry=64, num_tgts_per_mshr=64, max_resp_per_cycle=8, - post_apply_wb_queue_size=64 + post_push_wb_queue_size=64 ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -136,7 +136,7 @@ def __init__(self, num_mpus, cache_size, graph_path): def create_initial_bfs_update(self, init_addr, init_value): self.ctrl.createInitialBFSUpdate(init_addr, init_value) - + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 
2d81375b63..70babf5960 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -37,6 +37,8 @@ #include #include +#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6a8e000515..542f2e0221 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -31,6 +31,37 @@ namespace gem5 { +BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): + GraphWorkload(), initValue(init_value), atomSize(atom_size) +{ + initAddrBase = roundDown(init_addr, atomSize); + initIndex = (init_addr - initAddrBase) / atomSize; + numElementsPerLine = atomSize / sizeof(WorkListItem); +} + + +void +BFSWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + if (pkt->getAddr() == initAddrBase) { + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + + items[initIndex].tempProp = initValue; + items[initIndex].prop = initValue; + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, atomSize); + } + +} + uint32_t BFSWorkload::reduce(uint32_t update, uint32_t value) { @@ -68,4 +99,45 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + return update+value; +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + return (alpha*value*weight); +} + +bool +PRWorkload::applyCondition(WorkListItem wl) +{ + return wl.tempProp != wl.prop; +} + +bool +PRWorkload::preWBApply(WorkListItem& wl) +{ + if (applyCondition(wl)) { + if (wl.degree > 0) { + return true; + } + } + return false; +} + +std::tuple +PRWorkload::prePushApply(WorkListItem& wl) +{ + uint32_t delta = 
abs(wl.prop - wl.tempProp)/wl.degree; + if (delta > threshold) { + return std::make_tuple(delta, true, true); + } + uint32_t value = wl.tempProp; + return std::make_tuple(value, false, false); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c4db5c9e2f..cc0767305a 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -29,9 +29,13 @@ #ifndef __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ #define __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ +#include +#include #include #include "accl/graph/base/data_structs.hh" +#include "base/intmath.hh" +#include "mem/packet.hh" namespace gem5 @@ -42,6 +46,10 @@ class GraphWorkload public: GraphWorkload() {} ~GraphWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -52,16 +60,42 @@ class GraphWorkload class BFSWorkload : public GraphWorkload { private: - uint64_t initAddr; + uint64_t initAddrBase; + int initIndex; uint32_t initValue; + int numElementsPerLine; + int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value): - GraphWorkload(), - initAddr(init_addr), initValue(init_value) - {} + BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); ~BFSWorkload() {} + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual bool applyCondition(WorkListItem wl); + virtual bool preWBApply(WorkListItem& wl); + virtual std::tuple prePushApply(WorkListItem& wl); +}; + + +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + public: + 
PRWorkload(float alpha, float threshold): + GraphWorkload(), alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fd282834e9..dbd1705e8a 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -51,12 +51,13 @@ CenteralController::CenteralController(const Params& params): } void -CenteralController::initState() +CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->recvWorkload(workload); } + const auto& file = params().image_file; if (file == "") return; @@ -79,22 +80,11 @@ CenteralController::initState() }, system->cacheLineSize()); panic_if(!image.write(proxy), "%s: Unable to write image."); -} -void -CenteralController::startup() -{ - while(!initialUpdates.empty()) { - PacketPtr front = initialUpdates.front(); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - for (auto range: range_list) { - if (range.contains(front->getAddr())) { - mpu->handleIncomingUpdate(front); - } - } + for (auto mpu: mpuVector) { + if (!mpu->running() && (mpu->workCount ()> 0)) { + mpu->start(); } - initialUpdates.pop_front(); } } @@ -140,7 +130,7 @@ CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value); + workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 
1f1df00b4b..4c5ff28ebe 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -48,8 +48,6 @@ class CenteralController : public ClockedObject private: System* system; - GraphWorkload* workload; - Addr maxVertexAddr; std::deque initialUpdates; @@ -60,10 +58,13 @@ class CenteralController : public ClockedObject template PacketPtr createUpdatePacket(Addr addr, T value); public: + + GraphWorkload* workload; + PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - virtual void initState() override; + // virtual void initState() override; virtual void startup() override; void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 441457f2e8..b91b92c0fb 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -121,7 +121,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } } else { // TODO: Add and implement init function for GraphWorkload. - // graphWorkload->init(pkt); + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); memPort.sendFunctional(pkt); } } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c0091a494d..926caf46db 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -38,7 +38,7 @@ #include "base/statistics.hh" #include "params/CoalesceEngine.hh" -#define MAX_BITVECTOR_SIZE (1 << 28) + namespace gem5 { From 9b91fb71245587cfbd95e11bab0d767e571d69f3 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 22 Oct 2022 12:36:32 -0700 Subject: [PATCH 199/287] Fixing algo start issue. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index dbd1705e8a..61ad7c10b4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,7 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount ()> 0)) { + if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b91b92c0fb..72ceba6f89 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1079,7 +1079,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if (cacheBlocks[block_index].addr != addr) { + } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 926caf46db..8c187f8fb8 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -209,7 +209,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return _workCount; } + int workCount() { return needsPush.count(); } void recvVertexPull(); bool done(); From d4644cea189cf0deb4b7714018b2a14153c10d7b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 22 Oct 2022 13:49:41 -0700 Subject: [PATCH 200/287] Fixing block addr initialization. 
--- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 72ceba6f89..5b5374873c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -267,7 +267,7 @@ CoalesceEngine::recvWLRead(Addr addr) // is cold and addr or aligned_addr is 0. It fails because cache block // addr field is initialized to 0. Unfortunately Addr type is unsigned. // So you can not initialized addr to -1. - // assert(cacheBlocks[block_index].addr != aligned_addr); + assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (MSHR.find(block_index) == MSHR.end()) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8c187f8fb8..e710553be1 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -73,7 +73,7 @@ class CoalesceEngine : public BaseMemoryEngine // Tick lastWLWriteTick; Block() {} Block(int num_elements): - addr(0), + addr(-1), busyMask(0), valid(false), needsApply(false), From e2f68af811ad9a16c5d84aa678d1baf2208f9fe1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 21:43:33 -0700 Subject: [PATCH 201/287] Adding PR. 
--- src/accl/graph/base/graph_workload.cc | 48 ++++++++++++++++++---- src/accl/graph/base/graph_workload.hh | 15 ++++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 32 +-------------- src/accl/graph/sega/centeral_controller.hh | 8 +--- src/accl/graph/sega/coalesce_engine.cc | 27 ++---------- src/accl/graph/sega/coalesce_engine.hh | 3 -- 7 files changed, 57 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 542f2e0221..cbaef86a76 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -36,13 +36,13 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) { initAddrBase = roundDown(init_addr, atomSize); initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = atomSize / sizeof(WorkListItem); + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); } void BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) { if (pkt->getAddr() == initAddrBase) { @@ -99,23 +99,53 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): + GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +{ + numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +} + +void +PRWorkload::init(PacketPtr pkt, int bit_index_base, + std::bitset& needsPush, + std::deque& activeBits) +{ + WorkListItem items[numElementsPerLine]; + + pkt->writeDataToBlock((uint8_t*) items, atomSize); + for (int i = 0; i < numElementsPerLine; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + needsPush[bit_index_base + i] = 1; + activeBits.push_back(bit_index_base + i); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, 
atomSize); +} uint32_t PRWorkload::reduce(uint32_t update, uint32_t value) { - return update+value; + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); } uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { - return (alpha*value*weight); + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + return readFromFloat(alpha * value_float * weight_float); } bool PRWorkload::applyCondition(WorkListItem wl) { - return wl.tempProp != wl.prop; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return temp_float != prop_float; } bool @@ -132,12 +162,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) { - uint32_t delta = abs(wl.prop - wl.tempProp)/wl.degree; + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = abs((temp_float - prop_float) / wl.degree); if (delta > threshold) { + wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } - uint32_t value = wl.tempProp; - return std::make_tuple(value, false, false); + return std::make_tuple(0, false, false); } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index cc0767305a..831da97e71 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,7 @@ class GraphWorkload ~GraphWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; @@ -65,13 +65,14 @@ class BFSWorkload : public GraphWorkload uint32_t initValue; int numElementsPerLine; int atomSize; + public: BFSWorkload(uint64_t init_addr, uint32_t init_value, int 
atom_size); ~BFSWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); @@ -86,15 +87,17 @@ class PRWorkload : public GraphWorkload private: float alpha; float threshold; + + int numElementsPerLine; + int atomSize; + public: - PRWorkload(float alpha, float threshold): - GraphWorkload(), alpha(alpha), threshold(threshold) - {} + PRWorkload(float alpha, float threshold, int atom_size); ~PRWorkload() {} virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, + std::bitset& needsPush, std::deque& activeBits); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 17badf9ec4..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,8 +42,7 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ - PyBindMethod("createInitialBFSUpdate"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createInitialPRUpdate"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 61ad7c10b4..57198450d4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -103,30 +103,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -template PacketPtr -CenteralController::createUpdatePacket(Addr addr, T value) -{ - RequestPtr req = std::make_shared(addr, sizeof(T), addr, value); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - 
req->setPC(((Addr) value) << 2); - - PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); - - pkt->allocate(); - - pkt->setLE(value); - - return pkt; -} - -void -CenteralController::createInitialBFSUpdate(Addr init_addr, uint32_t init_value) -{ - PacketPtr update = createUpdatePacket(init_addr, init_value); - initialUpdates.push_back(update); -} - void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { @@ -134,13 +110,9 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) } void -CenteralController::createInitialPRUpdate() +CenteralController::createPRWorkload(float alpha, float threshold) { - for (auto mpu: mpuVector) { - if (!mpu->running() && (mpu->workCount() > 0)) { - mpu->start(); - } - } + workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 4c5ff28ebe..9ddb1b35f0 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -49,13 +49,11 @@ class CenteralController : public ClockedObject System* system; Addr maxVertexAddr; - std::deque initialUpdates; std::vector mpuVector; std::unordered_map addrRangeListMap; PacketPtr createReadPacket(Addr addr, unsigned int size); - template PacketPtr createUpdatePacket(Addr addr, T value); public: @@ -63,13 +61,11 @@ class CenteralController : public ClockedObject PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); - - // virtual void initState() override; virtual void startup() override; - void createInitialBFSUpdate(Addr init_addr, uint32_t init_value); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createInitialPRUpdate(); + void createPRWorkload(float alpha, float threshold); + void recvDoneSignal(); void printAnswerToHostSimout(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 
5b5374873c..e71cc1195f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -48,8 +48,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), maxRespPerCycle(params.max_resp_per_cycle), - _workCount(0), numPullsReceived(0), - postPushWBQueueSize(params.post_push_wb_queue_size), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), maxPotentialPostPushWB(0), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -76,25 +75,6 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } -// void -// CoalesceEngine::algoInit(PacketPtr pkt) -// { -// WorkListItem items[numElementsPerLine]; - -// if(workload == "PR") { -// //TODO: Add Alpha -// pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); -// int bit_index_base = getBitIndexBase(pkt->getAddr()); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - 0.2); -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// } -// pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); -// } -// } - void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -491,7 +471,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -550,6 +529,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } else { // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. 
DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); } @@ -788,7 +769,6 @@ CoalesceEngine::processNextPreWBApplyEvent() if (do_push) { int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { - _workCount++; needsPush[bit_index_base + index] = 1; activeBits.push_back(bit_index_base + index); if (!owner->running()) { @@ -1125,7 +1105,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; - _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index e710553be1..c8fec38e5b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -116,9 +116,6 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; - bool applyBeforeWB; - bool applyBeforePush; - int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; From bb31571e3cab67431ddbd146174997e87716b00b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 23 Oct 2022 22:14:05 -0700 Subject: [PATCH 202/287] Prepping for PR. 
--- configs/accl/sega-hbm.py | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py index 9078c185f3..1c9276f0a0 100644 --- a/configs/accl/sega-hbm.py +++ b/configs/accl/sega-hbm.py @@ -134,12 +134,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) + def print_answer(self): self.ctrl.printAnswerToHostSimout() @@ -169,8 +169,8 @@ def get_inputs(): m5.instantiate() - system.create_initial_bfs_update(init_addr, init_value) - system.create_bfs_workload(init_addr, init_value) + # system.create_bfs_workload(init_addr, init_value) + system.create_pr_workload(0.2, 0.0000001) exit_event = m5.simulate() print(f"Exited simulation at tick {m5.curTick()} " + \ f"because {exit_event.getCause()}") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e71cc1195f..2d5445093a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -577,6 +577,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } + + // TODO: Probably check for done here too. delete pkt; return true; } From 9c1f57e6d82ebbf5d3dd7b23e8a5cb0912fb04b4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 25 Oct 2022 13:52:56 -0700 Subject: [PATCH 203/287] Adding print function to GraphWorkload class. 
--- src/accl/graph/base/data_structs.hh | 21 ----------- src/accl/graph/base/graph_workload.cc | 44 ++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/centeral_controller.cc | 4 +- 4 files changed, 49 insertions(+), 24 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 70babf5960..d9028e2f10 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,7 +34,6 @@ #include #include -#include #include #define MAX_BITVECTOR_SIZE (1 << 28) @@ -181,26 +180,6 @@ class UniqueFIFO } }; -template -float -writeToFloat(T value) -{ - assert(sizeof(T) == sizeof(float)); - float float_form; - std::memcpy(&float_form, &value, sizeof(float)); - return float_form; -} - -template -T -readFromFloat(float value) -{ - assert(sizeof(T) == sizeof(float)); - T float_bits; - std::memcpy(&float_bits, &value, sizeof(float)); - return float_bits; -} - } #endif // __ACCL_GRAPH_BASE_DATA_STRUCTS_HH__ diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index cbaef86a76..ead32c0eb8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -28,9 +28,34 @@ #include "accl/graph/base/graph_workload.hh" +#include + +#include "base/cprintf.hh" +#include "base/intmath.hh" + namespace gem5 { +template +float +writeToFloat(T value) +{ + assert(sizeof(T) == sizeof(float)); + float float_form; + std::memcpy(&float_form, &value, sizeof(float)); + return float_form; +} + +template +T +readFromFloat(float value) +{ + assert(sizeof(T) == sizeof(float)); + T float_bits; + std::memcpy(&float_bits, &value, sizeof(float)); + return float_bits; +} + BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): GraphWorkload(), initValue(init_value), atomSize(atom_size) { @@ -99,6 +124,15 @@ BFSWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(value, true, false); } +std::string 
+BFSWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) { @@ -172,4 +206,14 @@ PRWorkload::prePushApply(WorkListItem& wl) return std::make_tuple(0, false, false); } +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + return csprintf( + "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + temp_float, temp_float, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 831da97e71..c391a80c23 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,7 +34,6 @@ #include #include "accl/graph/base/data_structs.hh" -#include "base/intmath.hh" #include "mem/packet.hh" @@ -55,6 +54,7 @@ class GraphWorkload virtual bool applyCondition(WorkListItem wl) = 0; virtual bool preWBApply(WorkListItem& wl) = 0; virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload @@ -79,6 +79,7 @@ class BFSWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -104,6 +105,7 @@ class PRWorkload : public GraphWorkload virtual bool applyCondition(WorkListItem wl); virtual bool preWBApply(WorkListItem& wl); virtual std::tuple prePushApply(WorkListItem& wl); + virtual std::string printWorkListItem(const WorkListItem wl); }; } diff --git a/src/accl/graph/sega/centeral_controller.cc 
b/src/accl/graph/sega/centeral_controller.cc index 57198450d4..fc2262e111 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -144,8 +144,8 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - std::string print = csprintf("WorkListItem[%lu][%d]: %s.", - addr, i, items[i].to_string()); + std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, + workload->printWorkListItem(items[i])); std::cout << print << std::endl; } From 95c676bd0ec2ddacf512945b4de454bd91f52f6c Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 25 Oct 2022 16:48:11 -0700 Subject: [PATCH 204/287] Updating PR --- src/accl/graph/base/graph_workload.cc | 36 +++++++++-------- src/accl/graph/sega/coalesce_engine.cc | 53 ++++++++++++++++---------- src/accl/graph/sega/wl_engine.cc | 10 ++--- 3 files changed, 58 insertions(+), 41 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ead32c0eb8..9f7e5fc4c5 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -77,8 +77,10 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, items[initIndex].tempProp = initValue; items[initIndex].prop = initValue; - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); + if (items[initIndex].degree > 0) { + needsPush[bit_index_base + initIndex] = 1; + activeBits.push_back(bit_index_base + initIndex); + } pkt->deleteData(); pkt->allocate(); @@ -150,8 +152,10 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, for (int i = 0; i < numElementsPerLine; i++) { items[i].tempProp = readFromFloat(0); items[i].prop = readFromFloat(1 - alpha); - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); + if (items[i].degree > 0) { + needsPush[bit_index_base + i] = 1; + 
activeBits.push_back(bit_index_base + i); + } } pkt->deleteData(); pkt->allocate(); @@ -170,7 +174,7 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); + float weight_float = writeToFloat(1); return readFromFloat(alpha * value_float * weight_float); } @@ -179,27 +183,27 @@ PRWorkload::applyCondition(WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - return temp_float != prop_float; + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; } bool PRWorkload::preWBApply(WorkListItem& wl) { - if (applyCondition(wl)) { - if (wl.degree > 0) { - return true; - } + if (applyCondition(wl) && (wl.degree > 0)) { + return true; } return false; } std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = abs((temp_float - prop_float) / wl.degree); - if (delta > threshold) { +{ + if (applyCondition(wl)) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + std::cout << "PRWorkload: delta: " << delta << std::endl; wl.prop = wl.tempProp; return std::make_tuple(delta, true, true); } @@ -211,7 +215,7 @@ PRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); return csprintf( - "WorkListItem{tempProp: %f, prop: %u, degree: %u, edgeIndex: %u}", + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, temp_float, wl.degree, wl.edgeIndex ); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2d5445093a..0d1eecf43f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -157,7 +157,7 @@ CoalesceEngine::recvWLRead(Addr addr) "%lu, and wl_offset: 
%d.\n", __func__, addr, block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); + block_index, cacheBlocks[block_index].to_string()); if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { @@ -176,15 +176,17 @@ CoalesceEngine::recvWLRead(Addr addr) addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), - responseQueue.size()); + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); // If they are scheduled for apply and WB those schedules should be @@ -476,6 +478,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -508,8 +511,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); for (int i = 0; i < numElementsPerLine; i++) { DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, - cacheBlocks[block_index].items[i].to_string()); + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); } cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; @@ -550,12 +553,14 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -603,7 +608,9 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, worklist_response.to_string(), addr_response); + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); responseQueue.pop_front(); DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " @@ -640,12 +647,13 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " "wl: %s. This request maps to cacheBlocks[%d], " "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, wl.to_string(), + __func__, addr, graphWorkload->printWorkListItem(wl), block_index, aligned_addr, wl_offset); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, wl.to_string(), addr); + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts @@ -666,13 +674,15 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", __func__, block_index, wl_offset, - cacheBlocks[block_index].items[wl_offset].to_string()); + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -899,12 +909,14 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", __func__, miss_addr, - cacheBlocks[block_index].items[wl_offset].to_string(), + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); @@ -1061,7 +1073,7 @@ CoalesceEngine::getOptimalPullAddr() return std::make_tuple( BitStatus::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory - } else if ((!cacheBlocks[block_index].valid) || (cacheBlocks[block_index].addr != addr)) { + } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( BitStatus::IN_MEMORY, addr, index_offset); @@ -1112,6 +1124,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); + std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 85fe9be2ca..a698f2cc0a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -263,10 +263,10 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) workListFile[addr] = wl; DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); DPRINTF(WLEngine, "%s: Added (addr: %lu, wl: %s) to " "workListFile. workListFile.size = %d.\n", __func__, addr, - wl.to_string(), workListFile.size()); + graphWorkload->printWorkListItem(wl), workListFile.size()); stats.vertexReadLatency.sample( ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); @@ -287,13 +287,13 @@ WLEngine::processNextReduceEvent() uint32_t update_value = registerFile[addr]; DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" ". 
registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - __func__, addr, registerFile[addr], - addr, workListFile[addr].to_string()); + __func__, addr, registerFile[addr], addr, + graphWorkload->printWorkListItem(workListFile[addr])); // TODO: Generalize this to reduce function rather than just min workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", - __func__, addr, workListFile[addr].to_string()); + __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); stats.numReduce++; owner->recvWLWrite(addr, workListFile[addr]); From 166c3ac21df0a8175334dc8c426309e603d81b03 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:11:05 -0700 Subject: [PATCH 205/287] Updating configs for pr and bfs. Fixing bugs for pr. --- configs/accl/bfs.py | 78 +++++++++++ configs/accl/pr.py | 78 +++++++++++ configs/accl/real-graph-gen.py | 41 ++++-- configs/accl/sega-hbm.py | 178 ------------------------- configs/accl/sega.py | 137 +++++++++---------- configs/accl/synth-graph-gen.py | 88 ++++++++---- src/accl/graph/base/graph_workload.cc | 10 +- src/accl/graph/sega/coalesce_engine.cc | 24 ++-- 8 files changed, 332 insertions(+), 302 deletions(-) create mode 100644 configs/accl/bfs.py create mode 100644 configs/accl/pr.py delete mode 100644 configs/accl/sega-hbm.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py new file mode 100644 index 0000000000..d02faa96ca --- /dev/null +++ b/configs/accl/bfs.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=float) + argparser.add_argument("init_value", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_bfs_workload(init_addr, init_value) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py new file mode 100644 index 0000000000..59e8b924c6 --- /dev/null +++ b/configs/accl/pr.py @@ -0,0 +1,78 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + ) + + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index db44c63a9a..b943a925c1 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -28,14 +28,20 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.path, args.num_gpts + if __name__ == "__main__": graph_path, num_gpts = get_inputs() @@ -59,16 +65,29 @@ def get_inputs(): print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if 
not all([binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): os.remove(delete.path) print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}") + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}", + "false", + f"{num_gpts}", + "32", + f"{graph_dir}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_dir}/binaries/gpts_{num_gpts}" + ) diff --git a/configs/accl/sega-hbm.py b/configs/accl/sega-hbm.py deleted file mode 100644 index 1c9276f0a0..0000000000 --- a/configs/accl/sega-hbm.py +++ /dev/null @@ -1,178 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret, intlv_low_bit + intlv_bits - 1 - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8, - post_push_wb_queue_size=64 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=512, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = HBMCtrl(dram=HBM_2000_4H_1x64(), - dram_2=HBM_2000_4H_1x64()) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - 
range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.wl_engine.in_ports - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - def setReqPort(self, port): - self.push_engine.out_ports = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image - -class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): - super(SEGA, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), - 2*num_mpus, - 32 - ) - - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) - gpt.set_vertex_range([vertex_ranges[i], vertex_ranges[i+num_mpus]]) - gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, 
threshold) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument("--verify", type=bool, help="Print final answer") - - args = argparser.parse_args() - - verify = False - if not args.verify is None: - verify = args.verify - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value, verify - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - # system.create_bfs_workload(init_addr, init_value) - system.create_pr_workload(0.2, 0.0000001) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") - if verify: - system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index c50c525297..42c07e2e94 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -24,100 +24,111 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( start=plain_range.start, size=plain_range.size(), intlvHighBit=intlv_low_bit + intlv_bits - 1, xorHighBit=0, intlvBits=intlv_bits, - intlvMatch=i)) - return ret + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + class GPT(SubSystem): def __init__(self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32 - ) - - self.vertex_mem_ctrl = MemCtrl(dram=HBM_1000_4H_1x128(burst_length=2)) - - self.edge_mem_ctrl = MemCtrl(dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - ) + Xpush_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=512, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() + ) + + self.edge_mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False + ) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = 
self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.dram.range = vertex_range + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) - gpt.set_vertex_range(vertex_ranges[i]) + gpt = GPT("2GiB", cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_mpus]] + ) + gpt.set_vertex_pch_bit(pch_bit) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) # Creating the interconnect among mpus @@ -128,31 +139,11 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def 
create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py index 16985b3537..15e4a6eff2 100644 --- a/configs/accl/synth-graph-gen.py +++ b/configs/accl/synth-graph-gen.py @@ -28,15 +28,27 @@ import argparse import subprocess + def get_inputs(): argparser = argparse.ArgumentParser() - argparser.add_argument("scale", type=int, help="The scale of the synth graph to generate.") - argparser.add_argument("deg", type=int, help="The average degree of the synth graph to generate.") - argparser.add_argument("num_gpts", type=int, help="Number gpts to create synth graph binaries for.") + argparser.add_argument( + "scale", type=int, help="The scale of the synth graph to generate." 
+ ) + argparser.add_argument( + "deg", + type=int, + help="The average degree of the synth graph to generate.", + ) + argparser.add_argument( + "num_gpts", + type=int, + help="Number gpts to create synth graph binaries for.", + ) args = argparser.parse_args() return args.scale, args.deg, args.num_gpts + if __name__ == "__main__": scale, deg, num_gpts = get_inputs() @@ -62,18 +74,27 @@ def get_inputs(): for delete in os.scandir(graph_path): os.remove(delete.path) print(f"Deleted everything in {graph_path}") - subprocess.run([f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt"]) - print(f"Generated a graph with scale " - f"{scale} and deg {deg}") - subprocess.run(["python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt"]) - print(f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt") + subprocess.run( + [ + f"{graph_gen}", + f"{scale}", + f"{deg}", + f"{graph_path}/graph_unordered.txt", + ] + ) + print(f"Generated a graph with scale " f"{scale} and deg {deg}") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}/graph_unordered.txt", + f"{graph_path}/graph.txt", + ] + ) + print( + f"Sorted the graph here {graph_path}/graph_unordered.txt" + f" and saved in {graph_path}/graph.txt" + ) subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) print(f"Deleted {graph_path}/graph_unordered.txt") @@ -88,16 +109,31 @@ def get_inputs(): print(f"Created {graph_path}/binaries/gpts_{num_gpts}") expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all([binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") for binary in expected_bins]): - print(f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}") + if not all( + [ + binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") + for binary in expected_bins + ] + ): + print( + f"Not all expected binaries found in 
{graph_path}/binaries/gpts_{num_gpts}" + ) for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): os.remove(delete.path) - print(f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}") - subprocess.run([f"{graph_reader}" , - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}"]) - print(f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}") + print( + f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" + ) + subprocess.run( + [ + f"{graph_reader}", + f"{graph_path}/graph.txt", + "false", + f"{num_gpts}", + "32", + f"{graph_path}/binaries/gpts_{num_gpts}", + ] + ) + print( + f"Created the graph binaries in " + f"{graph_path}/binaries/gpts_{num_gpts}" + ) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9f7e5fc4c5..e362d605c0 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -174,7 +174,9 @@ uint32_t PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(1); + float weight_float = 1.0; + float delta = alpha * value_float * weight_float; + return readFromFloat(alpha * value_float * weight_float); } @@ -198,14 +200,14 @@ PRWorkload::preWBApply(WorkListItem& wl) std::tuple PRWorkload::prePushApply(WorkListItem& wl) -{ +{ if (applyCondition(wl)) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float delta = (temp_float - prop_float) / wl.degree; - std::cout << "PRWorkload: delta: " << delta << std::endl; + uint32_t delta_uint = readFromFloat(delta); wl.prop = wl.tempProp; - return std::make_tuple(delta, true, true); + return std::make_tuple(delta_uint, true, true); } return std::make_tuple(0, false, false); } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0d1eecf43f..2f6555602c 100644 --- 
a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -196,7 +196,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].pendingApply = false; cacheBlocks[block_index].pendingWB = false; // HACK: If a read happens on the same cycle as another operation such - // apply setLastChangedTick to half a cycle later so that operations + // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" cacheBlocks[block_index].lastChangedTick = @@ -478,7 +478,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool do_push, do_wb_v; std::tie(delta, do_push, do_wb_v) = graphWorkload->prePushApply(items[i]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; do_wb |= do_wb_v; if (do_push) { owner->recvVertexPush(vertex_addr, delta, @@ -517,7 +516,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].needsWB |= do_wb; cacheBlocks[block_index].pendingData = false; - cacheBlocks[block_index].lastChangedTick = curTick(); + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -564,7 +566,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.size()); // TODO: Add a stat to count the number of WLItems that have been touched. 
cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); + // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); @@ -608,8 +610,8 @@ CoalesceEngine::processNextResponseEvent() num_responses_sent++; DPRINTF(CoalesceEngine, "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), + __func__, + graphWorkload->printWorkListItem(worklist_response), addr_response); responseQueue.pop_front(); @@ -652,7 +654,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, + "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); // Desing does not allow for write misses for now. assert(cacheBlocks[block_index].addr == aligned_addr); @@ -874,8 +876,11 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. 
- assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - // + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); @@ -1124,7 +1129,6 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) bool do_push, do_wb; std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( cacheBlocks[block_index].items[wl_offset]); - std::cout << "CoalesceEngine: delta: " << delta << std::endl; cacheBlocks[block_index].needsWB |= do_wb; if (do_push) { owner->recvVertexPush(vertex_addr, delta, From ffbef8e2cf85c635d8814ccf1951ea145a968fb6 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 26 Oct 2022 07:46:18 -0700 Subject: [PATCH 206/287] Fixing typos. --- configs/accl/bfs.py | 8 ++++---- configs/accl/sega.py | 2 +- src/accl/graph/base/graph_workload.cc | 1 - 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index d02faa96ca..fc32b96642 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -37,8 +37,8 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=float) - argparser.add_argument("init_value", type=float) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--verify", dest="verify", @@ -54,8 +54,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, - args.alpha, - args.threshold, + args.init_addr, + args.init_value, args.verify, ) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 42c07e2e94..0f4b133791 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -59,7 +59,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): 
post_push_wb_queue_size=64, ) self.push_engine = PushEngine( - Xpush_req_queue_size=32, + push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=512, update_queue_size=32, diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e362d605c0..44136cb4c1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -175,7 +175,6 @@ PRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); float weight_float = 1.0; - float delta = alpha * value_float * weight_float; return readFromFloat(alpha * value_float * weight_float); } From fe146055cc230e532d878a66cd0c1577a81234f3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 27 Oct 2022 14:24:18 -0700 Subject: [PATCH 207/287] Adding sample script. --- configs/accl/pr-sample.py | 109 +++++++++++++++++++++++++ src/accl/graph/sega/coalesce_engine.cc | 2 +- 2 files changed, 110 insertions(+), 1 deletion(-) create mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py new file mode 100644 index 0000000000..ac3616dc84 --- /dev/null +++ b/configs/accl/pr-sample.py @@ -0,0 +1,109 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from sega import SEGA + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 10us", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.verify, + args.sample, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + verify, + sample, + ) = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.create_pr_workload(alpha, threshold) + + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 2f6555602c..1dbe2a0d56 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -519,7 +519,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // HACK: In case processNextRead is called on the same tick as curTick // and is scheduled to read 
to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); + curTick() - (Tick) (clockPeriod() / 2); } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); From 151a02fbe697abb0713b99c0ff72fa4f16bf63b1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 28 Oct 2022 11:02:32 -0700 Subject: [PATCH 208/287] Fixing sim performance issue. --- src/accl/graph/base/graph_workload.cc | 8 ++++++-- src/accl/graph/base/graph_workload.hh | 9 ++++++--- src/accl/graph/sega/coalesce_engine.cc | 7 +++++-- src/accl/graph/sega/coalesce_engine.hh | 18 ++++++++++++++++-- 4 files changed, 33 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 44136cb4c1..07accff44f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -68,7 +68,8 @@ BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size) void BFSWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { if (pkt->getAddr() == initAddrBase) { WorkListItem items[numElementsPerLine]; @@ -80,6 +81,7 @@ BFSWorkload::init(PacketPtr pkt, int bit_index_base, if (items[initIndex].degree > 0) { needsPush[bit_index_base + initIndex] = 1; activeBits.push_back(bit_index_base + initIndex); + _workCount++; } pkt->deleteData(); @@ -144,7 +146,8 @@ PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): void PRWorkload::init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) + std::deque& activeBits, + int& _workCount) { WorkListItem items[numElementsPerLine]; @@ -155,6 +158,7 @@ PRWorkload::init(PacketPtr pkt, int bit_index_base, if (items[i].degree > 0) { needsPush[bit_index_base + i] = 1; activeBits.push_back(bit_index_base + i); + _workCount++; } } 
pkt->deleteData(); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index c391a80c23..6bbc4935c2 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -48,7 +48,8 @@ class GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits) = 0; + std::deque& activeBits, + int& _workCount) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual bool applyCondition(WorkListItem wl) = 0; @@ -73,7 +74,8 @@ class BFSWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); @@ -99,7 +101,8 @@ class PRWorkload : public GraphWorkload virtual void init(PacketPtr pkt, int bit_index_base, std::bitset& needsPush, - std::deque& activeBits); + std::deque& activeBits, + int& _workCount); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual bool applyCondition(WorkListItem wl); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 1dbe2a0d56..38f05f937a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -47,7 +47,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), 
maxPotentialPostPushWB(0), nextMemoryEvent([this] { @@ -102,7 +102,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) } else { // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } } @@ -473,6 +473,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (vertex_send_mask != 0) { assert(needsPush[it + i] == 1); needsPush[it + i] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb_v; @@ -784,6 +785,7 @@ CoalesceEngine::processNextPreWBApplyEvent() int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); if (needsPush[bit_index_base + index] == 0) { needsPush[bit_index_base + index] = 1; + _workCount++; activeBits.push_back(bit_index_base + index); if (!owner->running()) { owner->start(); @@ -1124,6 +1126,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) int slice_base_index = getBitIndexBase(addr); needsPush[slice_base_index + wl_offset] = 0; + _workCount--; uint32_t delta; bool do_push, do_wb; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c8fec38e5b..64c5c4af46 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -52,6 +52,17 @@ enum BitStatus NUM_STATUS }; +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_PRE_WB_APPLY, + PENDING_WB, + NUM_CACHE_STATE +}; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -69,6 +80,7 @@ class CoalesceEngine : public BaseMemoryEngine bool pendingApply; bool pendingWB; Tick lastChangedTick; + CacheState state; // TODO: This might be useful in the future // Tick lastWLWriteTick; Block() {} @@ -81,7 +93,8 @@ class CoalesceEngine : public BaseMemoryEngine pendingData(false), pendingApply(false), pendingWB(false), - lastChangedTick(0) 
+ lastChangedTick(0), + state(CacheState::INVALID) { items = new WorkListItem [num_elements]; } @@ -116,6 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine int maxRespPerCycle; std::deque> responseQueue; + int _workCount; int numPullsReceived; UniqueFIFO applyQueue; std::bitset needsPush; @@ -206,7 +220,7 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - int workCount() { return needsPush.count(); } + int workCount() { return _workCount; } void recvVertexPull(); bool done(); From 82d076c4bc2efca79614cb40f08ec080bd8ac7ac Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 31 Oct 2022 09:53:00 -0700 Subject: [PATCH 209/287] Fixing write miss issue. --- src/accl/graph/sega/coalesce_engine.cc | 92 ++++++++++++++------------ src/accl/graph/sega/coalesce_engine.hh | 30 ++++++++- 2 files changed, 76 insertions(+), 46 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 38f05f937a..7a064c1c2f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -495,6 +495,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) maxPotentialPostPushWB--; } + bool cache_wb = false; if (cacheBlocks[block_index].addr == addr) { DPRINTF(CoalesceEngine, "%s: Received read response to " "fill cacheBlocks[%d].\n", __func__, block_index); @@ -521,6 +522,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // and is scheduled to read to the same cacheBlocks[block_index] cacheBlocks[block_index].lastChangedTick = curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; } else if (do_wb) { PacketPtr wb_pkt = createWritePacket( addr, peerMemoryAtomSize, (uint8_t*) items); @@ -537,42 +539,44 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // TODO: Add a stat to count this. // FIXME: This is not a totally wasteful read. e.g. all reads // for pull in BFS are like this. 
- DPRINTF(CoalesceEngine, "%s: Totally wasteful read.\n", __func__); + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); } - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. 
- cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. 
+ cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } } } @@ -1045,7 +1049,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple +std::tuple CoalesceEngine::getOptimalPullAddr() { int visited_bits = 0; @@ -1066,7 +1070,7 @@ CoalesceEngine::getOptimalPullAddr() assert(vertex_send_mask == 0); activeBits.pop_front(); return std::make_tuple( - BitStatus::PENDING_READ, addr, index_offset); + WorkLocation::PENDING_READ, addr, index_offset); } else { // Only if it is in cache and it is in idle state. if ((cacheBlocks[block_index].addr == addr) && @@ -1078,12 +1082,12 @@ CoalesceEngine::getOptimalPullAddr() assert(!cacheBlocks[block_index].pendingData); activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_CACHE, block_index, index_offset); + WorkLocation::IN_CACHE, block_index, index_offset); // Otherwise if it is in memory } else if ((cacheBlocks[block_index].addr != addr)) { activeBits.pop_front(); return std::make_tuple( - BitStatus::IN_MEMORY, addr, index_offset); + WorkLocation::IN_MEMORY, addr, index_offset); } } activeBits.pop_front(); @@ -1091,20 +1095,20 @@ CoalesceEngine::getOptimalPullAddr() visited_bits++; } - return std::make_tuple(BitStatus::GARBAGE, 0, 0); + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - BitStatus bit_status; + WorkLocation bit_status; Addr location; int offset; std::tie(bit_status, location, offset) = getOptimalPullAddr(); - if (bit_status != BitStatus::GARBAGE) { - if (bit_status == BitStatus::PENDING_READ) { + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { // renaming the outputs to thier local names. 
Addr addr = location; int index_offset = offset; @@ -1116,7 +1120,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) pendingVertexPullReads[addr] = send_mask; numPullsReceived--; } - if (bit_status == BitStatus::IN_CACHE) { + if (bit_status == WorkLocation::IN_CACHE) { // renaming the outputs to their local names. int block_index = (int) location; int wl_offset = offset; @@ -1145,7 +1149,7 @@ CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) stats.lastVertexPushTime = curTick() - stats.lastResetTick; numPullsReceived--; } - if (bit_status == BitStatus::IN_MEMORY) { + if (bit_status == WorkLocation::IN_MEMORY) { if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { Addr addr = location; int index_offset = offset; diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 64c5c4af46..05e268270a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,7 +43,7 @@ namespace gem5 { -enum BitStatus +enum WorkLocation { PENDING_READ, IN_CACHE, @@ -65,6 +65,32 @@ enum CacheState class MPU; + +// TODO: Add active bit to WorkListItem class. Check active bit before activate +// Only activate if necessary and not active before. 
+class WorkDirectory +{ + private: + CoalesceEngine* owner; + Addr memoryAtomSize; + int atomBlockSize; + size_t elementSize; + + int _workCount; + public: + AddrRange memoryRange; + WorkDirectory(Addr atom_size, int block_size, size_t element_size): + memoryAtomSize(atom_size), atomBlockSize(block_size), + elementSize(element_size), _workCount(0) + {} + + void activate(Addr addr); + void deactivate(Addr addr); + int workCount(); + std::tuple getNextWork(); + +}; + class CoalesceEngine : public BaseMemoryEngine { private: @@ -140,7 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); + std::tuple getOptimalPullAddr(); int maxPotentialPostPushWB; // A map from addr to sendMask. sendMask determines which bytes to From f217715d8eae9774027635e6652755cdeaab0c00 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 1 Nov 2022 00:15:16 -0700 Subject: [PATCH 210/287] Restructuring the cache. 
--- src/accl/graph/base/data_structs.hh | 17 +- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/CoalesceEngine_bak.py | 50 + src/accl/graph/sega/coalesce_engine.cc | 553 +++------ src/accl/graph/sega/coalesce_engine.hh | 107 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 ++++++++++++++++++++ src/accl/graph/sega/coalesce_engine_bak.hh | 218 ++++ 7 files changed, 1834 insertions(+), 421 deletions(-) create mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py create mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc create mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index d9028e2f10..070e635736 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -45,29 +45,33 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 31; + bool active: 1; std::string to_string() { return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - tempProp, prop, degree, edgeIndex); + "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, + active ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + active(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): + uint32_t edge_index, uint32_t degree, bool active): tempProp(temp_prop), prop(prop), + edgeIndex(edge_index), degree(degree), - edgeIndex(edge_index) + active(active) {} }; @@ -88,7 +92,6 @@ struct __attribute__ ((packed)) Edge weight(weight), neighbor(neighbor) {} - }; static_assert(isPowerOf2(sizeof(WorkListItem))); diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 1fd3b968c5..8ec9214b49 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -38,8 +38,6 @@ class CoalesceEngine(BaseMemoryEngine): num_mshr_entry = Param.Int("Number of MSHR entries.") - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py new file mode 100644 index 0000000000..1fd3b968c5 --- /dev/null +++ b/src/accl/graph/sega/CoalesceEngine_bak.py @@ -0,0 +1,50 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from m5.params import * +from m5.proxy import * +from m5.objects.BaseMemoryEngine import BaseMemoryEngine + +class CoalesceEngine(BaseMemoryEngine): + type = 'CoalesceEngine' + cxx_header = "accl/graph/sega/coalesce_engine.hh" + cxx_class = 'gem5::CoalesceEngine' + + cache_size = Param.MemorySize("Size of the internal SRAM array.") + + num_mshr_entry = Param.Int("Number of MSHR entries.") + + num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") + + max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " + "requestor in each cycle. Used to limit b/w.") + + post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " + "apply process for applications that require " + "the apply process to happen exactly before " + "pushing the edgePointer to the PushEngine.") + diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 7a064c1c2f..66ff66c068 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), + maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), + numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + postPushWBQueueSize(params.post_push_wb_queue_size), + pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextPreWBApplyEvent([this] { processNextPreWBApplyEvent(); }, name() + ".nextPreWBApplyEvent"), + nextPrePushApplyEvent([this] { + 
processNextPrePushApplyEvent(); + }, name() + ".nextPrePushApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -66,7 +69,6 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - needsPush.reset(); } void @@ -83,15 +85,10 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); + // TODO: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. - // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].state == CacheState::IDLE); pkt->makeResponse(); pkt->setDataFromBlock( @@ -100,8 +97,8 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - // TODO: Add and implement init function for GraphWorkload. int bit_index_base = getBitIndexBase(pkt->getAddr()); + // FIXME: Pass workdirectory to graphworkload.init graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); memPort.sendFunctional(pkt); } @@ -110,6 +107,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) bool CoalesceEngine::done() { + // FIXME: Fix this later return applyQueue.empty() && needsPush.none() && memoryFunctionQueue.empty() && (onTheFlyReqs == 0); } @@ -123,6 +121,8 @@ CoalesceEngine::getBlockIndex(Addr addr) return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; } +// FIXME: This and the next function should be moved to the +// WorkDirectory. 
// addr should be aligned to peerMemoryAtomSize int CoalesceEngine::getBitIndexBase(Addr addr) @@ -134,6 +134,7 @@ CoalesceEngine::getBitIndexBase(Addr addr) return atom_index * block_bits; } +// FIXME: Read FIXME: Above // index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) Addr CoalesceEngine::getBlockAddrFromBitIndex(int index) @@ -161,17 +162,10 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { + // Hit DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. + assert(cacheBlocks[block_index].state != CacheState::INVALID); responseQueue.push_back(std::make_tuple( addr, cacheBlocks[block_index].items[wl_offset], curTick())); @@ -189,12 +183,7 @@ CoalesceEngine::recvWLRead(Addr addr) responseQueue.size()); // TODO: Stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].state = CacheState::BUSY; // HACK: If a read happens on the same cycle as another operation such // as apply set lastChangedTick to half a cycle later so that operation // scheduled by the original operation (apply in this example) are @@ -210,34 +199,20 @@ CoalesceEngine::recvWLRead(Addr addr) stats.numVertexReads++; return true; } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { + (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", __func__, addr); stats.readHitUnderMisses++; assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); @@ -245,195 +220,52 @@ CoalesceEngine::recvWLRead(Addr addr) return true; } else { // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. assert(cacheBlocks[block_index].addr != aligned_addr); assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); + + if (cacheBlocks[block_index].state != CacheState::INVALID) { + // conflict miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " + "Addr: %lu.\n", __func__, addr, cacheBlocks[block_index].addr); + cacheBlocks[block_index].hasConflict = true; + if (cacheBlocks[block_index].state == CacheState::IDLE) { + if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = 
CacheState::PENDING_WB; memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); + processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; + } else { + // NOTE: move the cache block to invalid state + // FIXME: Fix the issue below. + // May need to activate tracking for this + cacheBlocks[block_index].reset(); } } + // return int instead of bool to tell WLEngine to whether + // roll the first entry in the queue. + return false; } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; + // cold miss + assert(MSHR.find(block_index) == MSHR.end()); + if (MSHR.size() < numMSHREntries) { + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + return true; + } else { return false; } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; } } } @@ -589,8 +421,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) schedule(nextResponseEvent, nextCycle()); } - - // TODO: Probably check for done here too. delete pkt; return true; } @@ -771,15 +601,53 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextPreWBApplyEvent() { - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + int block_index = preWBApplyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. 
" "cacheBlock[%d] to be applied.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsPreWBApply); + bool block_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + block_active |= active; + if (active) { + // cacheWorkCount++; + // FUTUREME: When pulling from activeCacheBlocks, in case we + // face a block that is not in idle state, we basically pop + // that entry and push it to the back. We only delete entries + // in this buffer if pushed or evicted. + activeCacheBlocks.push_back(block_index); + } + } + if (block_active && !owner->running()) { + owner->start(); + } + + cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].hasConflict) { + if (cacheBlocks[block_index].dirty) { + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + } else { + // FIXME: Solve below issue. + // Not dirty but could be active still. 
+ // need to activate tracking + cacheBlocks[block_index].reset(); + } + } else { + cacheBlocks[block_index].state = CacheState::IDLE; + } + cacheBlocks[block_index].lastChangedTick = curTick(); + } else { + + } if (cacheBlocks[block_index].pendingApply) { assert(cacheBlocks[block_index].busyMask == 0); @@ -883,77 +751,85 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) __func__, block_index, cacheBlocks[block_index].to_string()); // A cache block should not be touched while it's waiting for data. // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - + // TODO: Figure out if this is still necessary. if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { return; } - assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; + + // NOTE: Search postPushWBQueue for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem);
- DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } + cacheBlocks[block_index].dirty = true; + need_send_pkt = false; postPushWBQueue.erase(wb); + } + } + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr ab_pkt = std::get<0>(*ab); + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { + ab_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); need_send_pkt = false; + activeBuffer.erase(ab); + } + } + if (!need_send_pkt) { + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsPreWBApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + for (auto it = 
MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + assert(aligned_miss_addr == cacheBlocks[block_index].addr); + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } + assert(MSHR[block_index].empty()); + MSHR.erase(block_index); + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + cacheBlocks[block_index].state = CacheState::BUSY; + } if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + pendingVertexPullReads.end()) { need_send_pkt = false; } @@ -964,11 +840,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - 
stats.numDoubleMemReads++; - } } } @@ -979,19 +850,27 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); + assert(cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].hasConflict); + assert(!cacheBlocks[block_index].needsPreWBApply); + assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); + Addr base_addr = cacheBlocks[block_index].addr; + for (int index = 0; index < numElementsPerLine; index++) { + if (cacheBlocks[block_index].items[index].active) { + Addr vertex_addr = base_addr + index * sizeof(WorkListItem); + // NOTE: Implement this + // workdir.activate() + // cacheWorkCount--; + } + } + if (activeCacheBlocks.find(block_index)) { + activeCacheBlocks.erase(block_index); + } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -999,30 +878,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - 
DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); + cacheBlocks[block_index].reset(); DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, @@ -1049,55 +905,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) } } -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { @@ -1262,8 +1069,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), ADD_STAT(bitvectorLength, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 05e268270a..8da67c7b43 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -60,9 +60,26 @@ enum CacheState IDLE, PENDING_PRE_WB_APPLY, PENDING_WB, + PENDING_PRE_PUSH_APPLY, NUM_CACHE_STATE }; +inline const char* const cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_PRE_WB_APPLY", + "PENDING_WB", + "PENDING_PRE_PUSH_APPLY" +}; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH +}; + class MPU; @@ -71,7 +88,6 @@ class MPU; class WorkDirectory { private: - CoalesceEngine* owner; Addr
memoryAtomSize; int atomBlockSize; size_t elementSize; @@ -88,7 +104,6 @@ class WorkDirectory void deactivate(Addr addr); int workCount(); std::tuple getNextWork(); - }; class CoalesceEngine : public BaseMemoryEngine @@ -100,47 +115,54 @@ class CoalesceEngine : public BaseMemoryEngine Addr addr; uint64_t busyMask; bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; + bool dirty; + bool hasConflict; + bool needsPreWBApply; CacheState state; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; + Tick lastChangedTick; Block() {} Block(int num_elements): addr(-1), busyMask(0), valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - state(CacheState::INVALID) + dirty(false), + hasConflict(false), + needsPreWBApply(false), + state(CacheState::INVALID), + lastChangedTick(0) { items = new WorkListItem [num_elements]; } + void reset() { + addr = -1; + busyMask = 0; + valid = false; + dirty = false; + hasConflict = false; + needsPreWBApply = false; + state = CacheState::INVALID; + lastChangedTick = 0; + } + std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? "true" : "false", lastChangedTick); + "dirty: %s, hasConflict: %s, needsPreWBApply: %s, " + "state: %s, lastChangedTick: %lu}", addr, busyMask, + valid ? "true" : "false", dirty ? "true" : "false", + hasConflict ? "true" : "false", + needsPreWBApply ? 
"true" : "false", + cacheStateStrings[state], lastChangedTick); } }; - struct SenderState : public Packet::SenderState + struct ReadPurpose : public Packet::SenderState { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} + ReadDestination _dest; + ReadPurpose(ReadDestination dest): _dest(dest) {} + ReadDestination dest() { return _dest; } }; + MPU* owner; GraphWorkload* graphWorkload; @@ -150,28 +172,33 @@ class CoalesceEngine : public BaseMemoryEngine int onTheFlyReqs; int numMSHREntries; - int numTgtsPerMSHR; std::unordered_map> MSHR; + + // Response route to WLEngine int maxRespPerCycle; std::deque> responseQueue; - int _workCount; + // Tracking work in cache + int cacheWorkCount; int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; + UniqueFIFO preWBApplyQueue; + // NOTE: Remember to erase from this upon eviction from cache + UniqueFIFO activeCacheBlocks; + + int pendingPullReads; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + int activeBufferSize; int postPushWBQueueSize; + std::deque> activeBuffer; std::deque> postPushWBQueue; int getBlockIndex(Addr addr); + // TODO: Should be moved to WorkDirectory int getBitIndexBase(Addr addr); Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. 
- std::unordered_map pendingVertexPullReads; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -188,6 +215,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextPreWBApplyEvent; void processNextPreWBApplyEvent(); + EventFunctionWrapper nextPrePushApplyEvent; + void processNextPrePushApplyEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); @@ -223,7 +253,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram mshrEntryLength; statistics::Histogram bitvectorLength; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; @@ -246,6 +275,8 @@ class CoalesceEngine : public BaseMemoryEngine bool recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); + // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory + // workcount. int workCount() { return _workCount; } void recvVertexPull(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc new file mode 100644 index 0000000000..7a064c1c2f --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.cc @@ -0,0 +1,1308 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/coalesce_engine.hh" + +#include + +#include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" +#include "debug/CacheBlockState.hh" +#include "debug/CoalesceEngine.hh" +#include "debug/SEGAStructureSize.hh" +#include "mem/packet_access.hh" +#include "sim/sim_exit.hh" + +namespace gem5 +{ + +CoalesceEngine::CoalesceEngine(const Params ¶ms): + BaseMemoryEngine(params), + numLines((int) (params.cache_size / peerMemoryAtomSize)), + numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), + onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + numTgtsPerMSHR(params.num_tgts_per_mshr), + maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), + numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), + maxPotentialPostPushWB(0), + nextMemoryEvent([this] { + processNextMemoryEvent(); + }, name() + ".nextMemoryEvent"), + nextResponseEvent([this] { + processNextResponseEvent(); + }, name() + ".nextResponseEvent"), + nextPreWBApplyEvent([this] { + processNextPreWBApplyEvent(); + }, name() + ".nextPreWBApplyEvent"), + stats(*this) +{ + assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); + cacheBlocks = new Block [numLines]; + for (int i = 0; i < numLines; i++) { + cacheBlocks[i] = Block(numElementsPerLine); + } + needsPush.reset(); +} + +void +CoalesceEngine::registerMPU(MPU* mpu) +{ + owner = mpu; +} + +void +CoalesceEngine::recvFunctional(PacketPtr pkt) +{ + if (pkt->isRead()) { + assert(pkt->getSize() == peerMemoryAtomSize); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + + if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid)) { + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsApply); + // NOTE: No need to check needsWB because there might be entries + // that have been updated and not written back in the cache. 
+ // assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + pkt->makeResponse(); + pkt->setDataFromBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + } else { + memPort.sendFunctional(pkt); + } + } else { + // TODO: Add and implement init function for GraphWorkload. + int bit_index_base = getBitIndexBase(pkt->getAddr()); + graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + memPort.sendFunctional(pkt); + } +} + +bool +CoalesceEngine::done() +{ + return applyQueue.empty() && needsPush.none() && + memoryFunctionQueue.empty() && (onTheFlyReqs == 0); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBitIndexBase(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); + int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); + return atom_index * block_bits; +} + +// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) +Addr +CoalesceEngine::getBlockAddrFromBitIndex(int index) +{ + assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); + Addr trimmed_addr = index * sizeof(WorkListItem); + return peerMemoryRange.addIntlvBits(trimmed_addr); +} + +bool +CoalesceEngine::recvWLRead(Addr addr) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + assert(aligned_addr % peerMemoryAtomSize == 0); + int block_index = getBlockIndex(aligned_addr); + assert(block_index < numLines); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + assert(wl_offset < 
numElementsPerLine); + DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " + "This request maps to cacheBlocks[%d], aligned_addr: " + "%lu, and wl_offset: %d.\n", __func__, addr, + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].valid)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); + stats.readHits++; + assert(!cacheBlocks[block_index].pendingData); + // No cache block could be in pendingApply and pendingWB at the + // same time. + assert(!(cacheBlocks[block_index].pendingApply && + cacheBlocks[block_index].pendingWB)); + // Hit + // TODO: Add a hit latency as a param for this object. + // Can't just schedule the nextResponseEvent for latency cycles in + // the future. + responseQueue.push_back(std::make_tuple( + addr, cacheBlocks[block_index].items[wl_offset], curTick())); + + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // If they are scheduled for apply and WB those schedules should be + // discarded. Since there is no easy way to take items out of the + // function queue. Those functions check for their respective bits + // and skip the process if the respective bit is set to false. 
+ cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + // HACK: If a read happens on the same cycle as another operation such + // as apply set lastChangedTick to half a cycle later so that operation + // scheduled by the original operation (apply in this example) are + // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" + cacheBlocks[block_index].lastChangedTick = + curTick() + (Tick) (clockPeriod() / 2); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + if (!nextResponseEvent.scheduled()) { + schedule(nextResponseEvent, nextCycle()); + } + stats.numVertexReads++; + return true; + } else if ((cacheBlocks[block_index].addr == aligned_addr) && + (cacheBlocks[block_index].pendingData)) { + // Hit under miss + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", + __func__, addr); + stats.readHitUnderMisses++; + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. 
Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " + "cacheBlocks[%d].\n", __func__, block_index); + } + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexReads++; + return true; + } else { + // miss + // FIXME: Make this assert work. It will break if the cache block + // is cold and addr or aligned_addr is 0. It fails because cache block + // addr field is initialized to 0. Unfortunately Addr type is unsigned. + // So you can not initialized addr to -1. + assert(cacheBlocks[block_index].addr != aligned_addr); + assert(MSHR.size() <= numMSHREntries); + DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); + if (MSHR.find(block_index) == MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" + " %lu not found in MSHRs.\n", __func__, block_index, addr); + if (MSHR.size() == numMSHREntries) { + // Out of MSHR entries + DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" + "Rejecting request.\n", __func__); + // TODO: Break out read rejections into more than one stat + // based on the cause of the rejection + stats.mshrEntryShortage++; + return false; + } else { + DPRINTF(CoalesceEngine, "%s: MSHR " + "entries available.\n", __func__); + if ((cacheBlocks[block_index].valid) || + (cacheBlocks[block_index].pendingData)) { + DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " + "with Addr: %lu.\n", __func__, addr, + cacheBlocks[block_index].addr); + if ((cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state.\n", __func__, block_index); + // We're in idle state + // Idle: valid && !pendingApply && !pendingWB; + // Note 0: needsApply has to be false. Because + // A cache line enters the idle state from two + // other states. First a busy state that does not + // need apply (needsApply is already false) or + // from pendingApplyState after being applied which + // clears the needsApply bit. needsApply is useful + // when a cache block has transitioned from + // pendingApply to busy without the apply happening. + // Note 1: pendingData does not have to be evaluated + // becuase pendingData is cleared when data + // arrives from the memory and valid does not + // denote cleanliness of the line. Rather it + // is used to differentiate between empty blocks + // and the blocks that have data from memory. + // pendingData denotes the transient state between + // getting a miss and getting the data for that miss. + // valid basically means that the data in the cache + // could be used to respond to read/write requests. + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + // There are no conflicts in idle state. 
+ assert(MSHR.find(block_index) == MSHR.end()); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" + "to be written back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextWriteBack for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " + "not need to be written back.\n", + __func__, block_index); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed " + "processNextRead for input " + "%d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " + "%s.\n", __func__, block_index, + cacheBlocks[block_index].to_string()); + } + } + // cacheBlocks[block_index].hasConflict = true; + 
MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + stats.readMisses++; + // TODO: Add readConflicts here. + stats.numVertexReads++; + return true; + } else { + // MSHR available and no conflict + DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " + "Allocating a cache line for it.\n" + , __func__, addr); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR[block_index].size() == 0); + + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" + " Addr: %lu.\n", __func__, block_index, addr); + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " + "for cacheBlocks[%d].\n", __func__, addr, block_index); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " + "input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + 
} + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, + cacheBlocks[block_index].to_string()); + stats.readMisses++; + stats.numVertexReads++; + return true; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " + "Addr: %lu already in MSHRs. It has a conflict " + "with addr: %lu.\n", __func__, block_index, addr, + cacheBlocks[block_index].addr); + assert(MSHR[block_index].size() <= numTgtsPerMSHR); + assert(MSHR[block_index].size() > 0); + if (MSHR[block_index].size() == numTgtsPerMSHR) { + DPRINTF(CoalesceEngine, "%s: Out of targets for " + "cacheBlocks[%d]. Rejecting request.\n", + __func__, block_index); + stats.mshrTargetShortage++; + return false; + } + DPRINTF(CoalesceEngine, "%s: There is room for another target " + "for cacheBlocks[%d].\n", __func__, block_index); + + // TODO: Might want to differentiate between different misses. + stats.readMisses++; + + MSHR[block_index].push_back(addr); + stats.mshrEntryLength.sample(MSHR[block_index].size()); + DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " + "cacheBlocks[%d].\n", __func__, addr, block_index); + stats.numVertexReads++; + return true; + } + } +} + +bool +CoalesceEngine::handleMemResp(PacketPtr pkt) +{ + assert(pkt->isResponse()); + DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", + __func__, pkt->print()); + if (pkt->isWrite()) { + DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); + delete pkt; + return true; + } + + onTheFlyReqs--; + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + WorkListItem* items = pkt->getPtr(); + + bool do_wb = false; + if (pkt->findNextSenderState()) { + assert(!((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid))); + // We have read the address to send the wl and it is not in the + // cache. Simply send the items to the PushEngine. 
+ + DPRINTF(CoalesceEngine, "%s: Received read response for pull read " + "for addr %lu.\n", __func__, addr); + int it = getBitIndexBase(addr); + uint64_t send_mask = pendingVertexPullReads[addr]; + // No applying of the line needed. + for (int i = 0; i < numElementsPerLine; i++) { + Addr vertex_addr = addr + i * sizeof(WorkListItem); + uint64_t vertex_send_mask = send_mask & (1 << i); + if (vertex_send_mask != 0) { + assert(needsPush[it + i] == 1); + needsPush[it + i] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb_v; + std::tie(delta, do_push, do_wb_v) = + graphWorkload->prePushApply(items[i]); + do_wb |= do_wb_v; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + items[i].edgeIndex, items[i].degree); + } else { + // TODO: Add a stat to count this. + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + } + } + pendingVertexPullReads.erase(addr); + maxPotentialPostPushWB--; + } + + bool cache_wb = false; + if (cacheBlocks[block_index].addr == addr) { + DPRINTF(CoalesceEngine, "%s: Received read response to " + "fill cacheBlocks[%d].\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + assert(MSHR.find(block_index) != MSHR.end()); + std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); + for (int i = 0; i < numElementsPerLine; i++) { + DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, i, graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[i])); + } + 
cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].needsWB |= do_wb; + cacheBlocks[block_index].pendingData = false; + // HACK: In case processNextRead is called on the same tick as curTick + // and is scheduled to read to the same cacheBlocks[block_index] + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + cache_wb = true; + } else if (do_wb) { + PacketPtr wb_pkt = createWritePacket( + addr, peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + // TODO: Add a stat to count this. + // FIXME: This is not a totally wasteful read. e.g. all reads + // for pull in BFS are like this. + DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); + } + + if (cache_wb) { + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. 
responseQueue.size = %d.\n", + __func__, addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + // cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + } + + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + + + // TODO: Probably check for done here too. + delete pkt; + return true; +} + +// TODO: For loop to empty the entire responseQueue. +void +CoalesceEngine::processNextResponseEvent() +{ + int num_responses_sent = 0; + + Addr addr_response; + WorkListItem worklist_response; + Tick response_queueing_tick; + while(true) { + std::tie(addr_response, worklist_response, response_queueing_tick) = + responseQueue.front(); + Tick waiting_ticks = curTick() - response_queueing_tick; + if (ticksToCycles(waiting_ticks) < 1) { + break; + } + owner->handleIncomingWL(addr_response, worklist_response); + num_responses_sent++; + DPRINTF(CoalesceEngine, + "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", + __func__, + graphWorkload->printWorkListItem(worklist_response), + addr_response); + + responseQueue.pop_front(); + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" + "responseQueue.size = %d.\n", __func__, + responseQueue.size()); + stats.responseQueueLatency.sample( + waiting_ticks * 1e9 / getClockFrequency()); + if (num_responses_sent >= maxRespPerCycle) { + if (!responseQueue.empty()) { + stats.responsePortShortage++; + } + break; + } + if (responseQueue.empty()) { + break; + } + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } +} + +void +CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) +{ + Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); + int block_index = getBlockIndex(aligned_addr); + int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " + "wl: %s. This request maps to cacheBlocks[%d], " + "aligned_addr: %lu, and wl_offset: %d.\n", + __func__, addr, graphWorkload->printWorkListItem(wl), + block_index, aligned_addr, wl_offset); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " + "with Addr: %lu.\n", __func__, + graphWorkload->printWorkListItem(wl), addr); + // Desing does not allow for write misses for now. + assert(cacheBlocks[block_index].addr == aligned_addr); + // cache state asserts + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask != 0); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + // respective bit in busyMask for wl is set. 
+ assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == + (1 << wl_offset)); + + if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { + cacheBlocks[block_index].needsWB |= true; + stats.numVertexWrites++; + } + cacheBlocks[block_index].items[wl_offset] = wl; + if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { + cacheBlocks[block_index].needsApply |= true; + cacheBlocks[block_index].needsWB |= true; + } + + cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", + __func__, block_index, wl_offset, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset])); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + + // TODO: Make this more general and programmable. + if ((cacheBlocks[block_index].busyMask == 0)) { + if (cacheBlocks[block_index].needsApply) { + cacheBlocks[block_index].pendingApply = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + applyQueue.push_back(block_index); + DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " + "applyQueue.\n", __func__, block_index); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + } else { + assert(MSHR.size() <= numMSHREntries); + // cache line has conflict. 
+ if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflict.\n", __func__, block_index); + if (cacheBlocks[block_index].needsWB) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" + " back.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" + " a write back.\n", __func__, block_index); + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " + "for input %d to memoryFunctionQueue.\n", + __func__, block_index); + if ((!nextMemoryEvent.pending()) && + 
(!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + } + } + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + +} + +void +CoalesceEngine::processNextPreWBApplyEvent() +{ + int block_index = applyQueue.front(); + DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " + "cacheBlock[%d] to be applied.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingWB); + + if (cacheBlocks[block_index].pendingApply) { + assert(cacheBlocks[block_index].busyMask == 0); + for (int index = 0; index < numElementsPerLine; index++) { + bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); + if (do_push) { + int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); + if (needsPush[bit_index_base + index] == 0) { + needsPush[bit_index_base + index] = 1; + _workCount++; + activeBits.push_back(bit_index_base + index); + if (!owner->running()) { + owner->start(); + } + } + } + } + stats.bitvectorLength.sample(needsPush.count()); + + assert(cacheBlocks[block_index].needsWB); + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + + assert(MSHR.size() <= numMSHREntries); + if (MSHR.find(block_index) != MSHR.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " + "conflicts.\n", __func__, block_index); + cacheBlocks[block_index].pendingWB = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + 
[this] (int block_index, Tick schedule_tick) { + processNextWriteBack(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " + "idle state now.\n", __func__, block_index); + } + DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + stats.numInvalidApplies++; + } + + applyQueue.pop_front(); + if ((!applyQueue.empty()) && + (!nextPreWBApplyEvent.scheduled())) { + schedule(nextPreWBApplyEvent, nextCycle()); + } + + if (done()) { + owner->recvDoneSignal(); + } +} + +void +CoalesceEngine::processNextMemoryEvent() +{ + if (memPort.blocked()) { + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function next_memory_function; + int next_memory_function_input; + Tick next_memory_function_tick; + std::tie( + next_memory_function, + next_memory_function_input, + next_memory_function_tick) = memoryFunctionQueue.front(); + next_memory_function(next_memory_function_input, next_memory_function_tick); + memoryFunctionQueue.pop_front(); + stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) + * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" + "memoryFunctionQueue.size = %d.\n", __func__, + memoryFunctionQueue.size()); + + assert(!nextMemoryEvent.pending()); + assert(!nextMemoryEvent.scheduled()); + if ((!memoryFunctionQueue.empty())) { + schedule(nextMemoryEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", + __func__, block_index, cacheBlocks[block_index].to_string()); + // A cache block should not be touched while it's waiting for data. + // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); + + if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { + return; + } + + assert(!cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(!cacheBlocks[block_index].pendingWB); + + bool need_send_pkt = true; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr wb_pkt = std::get<0>(*wb); + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { + wb_pkt->writeDataToBlock( + (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].needsWB = true; + for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + + if (aligned_miss_addr == cacheBlocks[block_index].addr) { + int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); + DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " + "cacheBlocks[%d] can be serviced with the received " + "packet.\n",__func__, miss_addr, block_index); + // TODO: Make this block of code into a function + 
responseQueue.push_back(std::make_tuple(miss_addr, + cacheBlocks[block_index].items[wl_offset], curTick())); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " + "to responseQueue. responseQueue.size = %d.\n", + __func__, miss_addr, + graphWorkload->printWorkListItem( + cacheBlocks[block_index].items[wl_offset]), + responseQueue.size()); + // TODO: Add a stat to count the number of WLItems that have been touched. + cacheBlocks[block_index].busyMask |= (1 << wl_offset); + cacheBlocks[block_index].lastChangedTick = curTick(); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + it = MSHR[block_index].erase(it); + } else { + it++; + } + } + if (MSHR[block_index].empty()) { + MSHR.erase(block_index); + } + + if ((!nextResponseEvent.scheduled()) && + (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + postPushWBQueue.erase(wb); + need_send_pkt = false; + } + } + + if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != + pendingVertexPullReads.end()) { + need_send_pkt = false; + } + + if (need_send_pkt) { + PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, + peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " + "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + + if (pendingVertexPullReads.find(pkt->getAddr()) != + pendingVertexPullReads.end()) { + stats.numDoubleMemReads++; + } + } +} + +void +CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) +{ + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", + __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].needsWB); + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + assert(!cacheBlocks[block_index].pendingApply); + assert(cacheBlocks[block_index].pendingWB); + + // Why would we write it back if it does not have a conflict. 
+ assert(MSHR.size() <= numMSHREntries); + assert(MSHR.find(block_index) != MSHR.end()); + + PacketPtr pkt = createWritePacket( + cacheBlocks[block_index].addr, peerMemoryAtomSize, + (uint8_t*) cacheBlocks[block_index].items); + DPRINTF(CoalesceEngine, "%s: Created a write packet to " + "Addr: %lu, size = %d.\n", __func__, + pkt->getAddr(), pkt->getSize()); + memPort.sendPacket(pkt); + // onTheFlyReqs++; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].pendingWB = false; + + Addr miss_addr = MSHR[block_index].front(); + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); + DPRINTF(CoalesceEngine, "%s: First conflicting address for" + " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", + __func__, block_index, miss_addr, aligned_miss_addr); + + cacheBlocks[block_index].addr = aligned_miss_addr; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].needsWB = false; + cacheBlocks[block_index].needsApply = false; + cacheBlocks[block_index].pendingData = true; + cacheBlocks[block_index].pendingApply = false; + cacheBlocks[block_index].pendingWB = false; + cacheBlocks[block_index].lastChangedTick = curTick(); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" + " %d to memoryFunctionQueue.\n", __func__, block_index); + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + } else { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " + "write back has been scheduled for it. 
Ignoring " + "the current write back scheduled at tick %lu for " + "the right function scheduled later.\n", + __func__, block_index, schedule_tick); + stats.numInvalidWriteBacks++; + } +} + +void +CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) +{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + memPort.sendPacket(wb_pkt); + postPushWBQueue.pop_front(); + } +} + +std::tuple +CoalesceEngine::getOptimalPullAddr() +{ + int visited_bits = 0; + int num_intial_active_bits = activeBits.size(); + while (visited_bits < num_intial_active_bits) { + int index = activeBits.front(); + int base_index = roundDown(index, numElementsPerLine); + int index_offset = index - base_index; + assert(needsPush[index] == 1); + assert(index_offset < numElementsPerLine); + + Addr addr = getBlockAddrFromBitIndex(base_index); + int block_index = getBlockIndex(addr); + if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) + { + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::PENDING_READ, addr, index_offset); + } else { + // Only if it is in cache and it is in idle state. 
+ if ((cacheBlocks[block_index].addr == addr) && + (cacheBlocks[block_index].valid) && + (cacheBlocks[block_index].busyMask == 0) && + (!cacheBlocks[block_index].pendingApply) && + (!cacheBlocks[block_index].pendingWB)) { + assert(!cacheBlocks[block_index].needsApply); + assert(!cacheBlocks[block_index].pendingData); + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_CACHE, block_index, index_offset); + // Otherwise if it is in memory + } else if ((cacheBlocks[block_index].addr != addr)) { + activeBits.pop_front(); + return std::make_tuple( + WorkLocation::IN_MEMORY, addr, index_offset); + } + } + activeBits.pop_front(); + activeBits.push_back(index); + visited_bits++; + } + + return std::make_tuple(WorkLocation::GARBAGE, 0, 0); +} + +void +CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) +{ + WorkLocation bit_status; + Addr location; + int offset; + + std::tie(bit_status, location, offset) = getOptimalPullAddr(); + + if (bit_status != WorkLocation::GARBAGE) { + if (bit_status == WorkLocation::PENDING_READ) { + // renaming the outputs to thier local names. + Addr addr = location; + int index_offset = offset; + + uint64_t send_mask = pendingVertexPullReads[addr]; + uint64_t vertex_send_mask = send_mask & (1 << index_offset); + assert(vertex_send_mask == 0); + send_mask |= (1 << index_offset); + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_CACHE) { + // renaming the outputs to their local names. 
+ int block_index = (int) location; + int wl_offset = offset; + + Addr addr = cacheBlocks[block_index].addr; + Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); + int slice_base_index = getBitIndexBase(addr); + + needsPush[slice_base_index + wl_offset] = 0; + _workCount--; + + uint32_t delta; + bool do_push, do_wb; + std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( + cacheBlocks[block_index].items[wl_offset]); + cacheBlocks[block_index].needsWB |= do_wb; + if (do_push) { + owner->recvVertexPush(vertex_addr, delta, + cacheBlocks[block_index].items[wl_offset].edgeIndex, + cacheBlocks[block_index].items[wl_offset].degree); + } else { + DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); + owner->recvPrevPullCorrection(); + } + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; + numPullsReceived--; + } + if (bit_status == WorkLocation::IN_MEMORY) { + if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { + Addr addr = location; + int index_offset = offset; + uint64_t send_mask = (1 << index_offset); + assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + SenderState* sender_state = new SenderState(true); + pkt->pushSenderState(sender_state); + memPort.sendPacket(pkt); + onTheFlyReqs++; + maxPotentialPostPushWB++; + pendingVertexPullReads[addr] = send_mask; + numPullsReceived--; + } + } + } + + stats.bitvectorSearchStatus[bit_status]++; + + if (numPullsReceived > 0) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " + "0 to memoryFunctionQueue.\n", __func__); + } +} + +void +CoalesceEngine::recvMemRetry() +{ + DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); + + if (!nextMemoryEvent.pending()) { + 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); + return; + } + assert(!nextMemoryEvent.scheduled()); + nextMemoryEvent.wake(); + schedule(nextMemoryEvent, nextCycle()); +} + +void +CoalesceEngine::recvVertexPull() +{ + bool should_schedule = (numPullsReceived == 0); + numPullsReceived++; + + stats.verticesPulled++; + stats.lastVertexPullTime = curTick() - stats.lastResetTick; + if (should_schedule) { + memoryFunctionQueue.emplace_back( + [this] (int slice_base, Tick schedule_tick) { + processNextVertexPull(slice_base, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + } +} + +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) + : statistics::Group(&_coalesce), + coalesce(_coalesce), + lastResetTick(0), + ADD_STAT(numVertexReads, statistics::units::Count::get(), + "Number of memory vertecies read from cache."), + ADD_STAT(numVertexWrites, statistics::units::Count::get(), + "Number of memory vertecies written to cache."), + ADD_STAT(readHits, statistics::units::Count::get(), + "Number of cache hits."), + ADD_STAT(readMisses, statistics::units::Count::get(), + "Number of cache misses."), + ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), + "Number of cache hit under misses."), + ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), + "Number of cache rejections caused by entry shortage."), + ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), + "Number of cache rejections caused by target shortage."), + ADD_STAT(responsePortShortage, statistics::units::Count::get(), + "Number of times a response has been " + "delayed because of port shortage. "), + ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), + "Number of times memory bandwidth was not available."), + ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), + "Number of times a memory block has been read twice. 
" + "Once for push and once to populate the cache."), + ADD_STAT(verticesPulled, statistics::units::Count::get(), + "Number of times a pull request has been sent by PushEngine."), + ADD_STAT(verticesPushed, statistics::units::Count::get(), + "Number of times a vertex has been pushed to the PushEngine"), + ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), + "Time of the last pull request. (Relative to reset_stats)"), + ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), + "Time of the last vertex push. (Relative to reset_stats)"), + ADD_STAT(numInvalidApplies, statistics::units::Count::get(), + "Number of times a line has become busy" + " while waiting to be applied."), + ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), + "Number of times a scheduled memory function has been invalid."), + ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), + "Distribution for the location of vertex searches."), + ADD_STAT(hitRate, statistics::units::Ratio::get(), + "Hit rate in the cache."), + ADD_STAT(vertexPullBW, statistics::units::Rate::get(), + "Rate at which pull requests arrive."), + ADD_STAT(vertexPushBW, statistics::units::Rate::get(), + "Rate at which vertices are pushed."), + ADD_STAT(mshrEntryLength, statistics::units::Count::get(), + "Histogram on the length of the mshr entries."), + ADD_STAT(bitvectorLength, statistics::units::Count::get(), + "Histogram of the length of the bitvector."), + ADD_STAT(responseQueueLatency, statistics::units::Second::get(), + "Histogram of the response latency to WLEngine. 
(ns)"), + ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + "Histogram of the latency of processing a memory function.") +{ +} + +void +CoalesceEngine::CoalesceStats::regStats() +{ + using namespace statistics; + + bitvectorSearchStatus.init(NUM_STATUS); + bitvectorSearchStatus.subname(0, "PENDING_READ"); + bitvectorSearchStatus.subname(1, "IN_CACHE"); + bitvectorSearchStatus.subname(2, "IN_MEMORY"); + bitvectorSearchStatus.subname(3, "GARBAGE"); + + hitRate = (readHits + readHitUnderMisses) / + (readHits + readHitUnderMisses + readMisses); + + vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; + + vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; + + mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); + bitvectorLength.init(64); + responseQueueLatency.init(64); + memoryFunctionLatency.init(64); +} + +void +CoalesceEngine::CoalesceStats::resetStats() +{ + statistics::Group::resetStats(); + + lastResetTick = curTick(); +} + +} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh new file mode 100644 index 0000000000..0787a334c1 --- /dev/null +++ b/src/accl/graph/sega/coalesce_engine_bak.hh @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ + +#include + +#include "accl/graph/base/data_structs.hh" +#include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" +#include "base/cprintf.hh" +#include "base/statistics.hh" +#include "params/CoalesceEngine.hh" + + + +namespace gem5 +{ + +enum WorkLocation +{ + PENDING_READ, + IN_CACHE, + IN_MEMORY, + GARBAGE, + NUM_STATUS +}; + +class MPU; + +class CoalesceEngine : public BaseMemoryEngine +{ + private: + struct Block + { + WorkListItem* items; + Addr addr; + uint64_t busyMask; + bool valid; + bool needsApply; + bool needsWB; + bool pendingData; + bool pendingApply; + bool pendingWB; + Tick lastChangedTick; + // TODO: This might be useful in the future + // Tick lastWLWriteTick; + Block() {} + Block(int num_elements): + addr(-1), + busyMask(0), + valid(false), + needsApply(false), + needsWB(false), + pendingData(false), + pendingApply(false), + pendingWB(false), + lastChangedTick(0), + { + items = new WorkListItem [num_elements]; + } + + std::string to_string() { + return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " + "needsApply: %s, needsWB: %s, pendingData: %s, " + "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + needsApply ? "true" : "false", needsWB ? "true" : "false", + pendingData ? "true" : "false", pendingApply ? "true" : "false", + pendingWB ? 
"true" : "false", lastChangedTick); + } + }; + + struct SenderState : public Packet::SenderState + { + bool isRetry; + SenderState(bool is_retry): isRetry(is_retry) {} + }; + MPU* owner; + GraphWorkload* graphWorkload; + + int numLines; + int numElementsPerLine; + Block* cacheBlocks; + + int onTheFlyReqs; + int numMSHREntries; + int numTgtsPerMSHR; + std::unordered_map> MSHR; + int maxRespPerCycle; + std::deque> responseQueue; + + int _workCount; + int numPullsReceived; + UniqueFIFO applyQueue; + std::bitset needsPush; + std::deque activeBits; + int postPushWBQueueSize; + std::deque> postPushWBQueue; + + int getBlockIndex(Addr addr); + int getBitIndexBase(Addr addr); + Addr getBlockAddrFromBitIndex(int index); + std::tuple getOptimalPullAddr(); + + int maxPotentialPostPushWB; + // A map from addr to sendMask. sendMask determines which bytes to + // send for push when getting the read response from memory. + std::unordered_map pendingVertexPullReads; + + MemoryEvent nextMemoryEvent; + void processNextMemoryEvent(); + void processNextRead(int block_index, Tick schedule_tick); + void processNextWriteBack(int block_index, Tick schedule_tick); + void processNextVertexPull(int ignore, Tick schedule_tick); + void processNextPostPushWB(int ignore, Tick schedule_tick); + std::deque, int, Tick>> memoryFunctionQueue; + + EventFunctionWrapper nextResponseEvent; + void processNextResponseEvent(); + + EventFunctionWrapper nextPreWBApplyEvent; + void processNextPreWBApplyEvent(); + + struct CoalesceStats : public statistics::Group + { + CoalesceStats(CoalesceEngine &coalesce); + + virtual void regStats() override; + + virtual void resetStats() override; + + CoalesceEngine &coalesce; + + Tick lastResetTick; + + statistics::Scalar numVertexReads; + statistics::Scalar numVertexWrites; + statistics::Scalar readHits; + statistics::Scalar readMisses; + statistics::Scalar readHitUnderMisses; + statistics::Scalar mshrEntryShortage; + statistics::Scalar mshrTargetShortage; + 
statistics::Scalar responsePortShortage; + statistics::Scalar numMemoryBlocks; + statistics::Scalar numDoubleMemReads; + statistics::Scalar verticesPulled; + statistics::Scalar verticesPushed; + statistics::Scalar lastVertexPullTime; + statistics::Scalar lastVertexPushTime; + statistics::Scalar numInvalidApplies; + statistics::Scalar numInvalidWriteBacks; + + statistics::Vector bitvectorSearchStatus; + + statistics::Formula hitRate; + statistics::Formula vertexPullBW; + statistics::Formula vertexPushBW; + + statistics::Histogram mshrEntryLength; + statistics::Histogram bitvectorLength; + statistics::Histogram responseQueueLatency; + statistics::Histogram memoryFunctionLatency; + }; + + CoalesceStats stats; + + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; + + public: + PARAMS(CoalesceEngine); + CoalesceEngine(const Params ¶ms); + void registerMPU(MPU* mpu); + + void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } + virtual void recvFunctional(PacketPtr pkt); + + bool recvWLRead(Addr addr); + void recvWLWrite(Addr addr, WorkListItem wl); + + int workCount() { return _workCount; } + void recvVertexPull(); + + bool done(); +}; + +} + +#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ From 80b3803f040e09cae9f083e39d637c6445aab247 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 00:05:27 -0800 Subject: [PATCH 211/287] First working and tested version of workdirectory. 
--- configs/accl/bfs.py | 1 + configs/accl/sega.py | 6 +- src/accl/graph/base/data_structs.hh | 23 +- src/accl/graph/base/graph_workload.cc | 236 ++-- src/accl/graph/base/graph_workload.hh | 67 +- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 7 +- src/accl/graph/sega/CoalesceEngine_bak.py | 50 - src/accl/graph/sega/SConscript | 5 +- src/accl/graph/sega/centeral_controller.cc | 14 +- src/accl/graph/sega/centeral_controller.hh | 3 +- src/accl/graph/sega/coalesce_engine.cc | 932 +++++++------- src/accl/graph/sega/coalesce_engine.hh | 117 +- src/accl/graph/sega/coalesce_engine_bak.cc | 1308 -------------------- src/accl/graph/sega/coalesce_engine_bak.hh | 218 ---- src/accl/graph/sega/enums.cc | 57 + src/accl/graph/sega/enums.hh | 66 + src/accl/graph/sega/mpu.cc | 6 - src/accl/graph/sega/mpu.hh | 6 +- src/accl/graph/sega/push_engine.cc | 37 +- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 28 +- src/accl/graph/sega/wl_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 212 ++++ src/mem/mem_ctrl.cc | 2 +- 25 files changed, 1030 insertions(+), 2376 deletions(-) delete mode 100644 src/accl/graph/sega/CoalesceEngine_bak.py delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.cc delete mode 100644 src/accl/graph/sega/coalesce_engine_bak.hh create mode 100644 src/accl/graph/sega/enums.cc create mode 100644 src/accl/graph/sega/enums.hh create mode 100644 src/accl/graph/sega/work_directory.hh diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index fc32b96642..a201acd4d1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -68,6 +68,7 @@ def get_inputs(): m5.instantiate() + system.create_pop_count_directory(256) system.create_bfs_workload(init_addr, init_value) exit_event = m5.simulate() print( diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 0f4b133791..54f22b1377 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -54,8 +54,8 @@ def __init__(self, 
edge_memory_size: str, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, num_mshr_entry=64, - num_tgts_per_mshr=64, max_resp_per_cycle=8, + active_buffer_size = 64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -139,6 +139,10 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 070e635736..84233ae39c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -36,8 +36,6 @@ #include #include -#define MAX_BITVECTOR_SIZE (1 << 28) - namespace gem5 { @@ -45,33 +43,28 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; + uint32_t degree : 32; uint32_t edgeIndex : 32; - uint32_t degree : 31; - bool active: 1; std::string to_string() { - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u, active: %s}", tempProp, prop, edgeIndex, degree, - active ? 
"true" : "false"); + return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " + "degree: %u}", tempProp, prop, edgeIndex, degree); } WorkListItem(): tempProp(0), prop(0), - edgeIndex(0), degree(0), - active(false) + edgeIndex(0) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t edge_index, uint32_t degree, bool active): + uint32_t degree, uint32_t edge_index): tempProp(temp_prop), prop(prop), - edgeIndex(edge_index), degree(degree), - active(active) + edgeIndex(edge_index) {} }; @@ -111,8 +104,8 @@ struct MetaEdge { std::string to_string() { - return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u}", - src, dst, weight); + return csprintf("MetaEdge{src: %lu, dst:%lu, weight: %u, value: %u}", + src, dst, weight, value); } }; diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 07accff44f..446509201f 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -56,39 +56,27 @@ readFromFloat(float value) return float_bits; } -BFSWorkload::BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size): - GraphWorkload(), initValue(init_value), atomSize(atom_size) -{ - initAddrBase = roundDown(init_addr, atomSize); - initIndex = (init_addr - initAddrBase) / atomSize; - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - - void -BFSWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) +BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) { - if (pkt->getAddr() == initAddrBase) { - WorkListItem items[numElementsPerLine]; + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); - pkt->writeDataToBlock((uint8_t*) items, atomSize); + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - items[initIndex].tempProp = initValue; - items[initIndex].prop = initValue; - if 
(items[initIndex].degree > 0) { - needsPush[bit_index_base + initIndex] = 1; - activeBits.push_back(bit_index_base + initIndex); - _workCount++; - } + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } pkt->deleteData(); pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } - } uint32_t @@ -104,28 +92,16 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::applyCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem wl) { - return wl.tempProp < wl.prop; -} - -bool -BFSWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - wl.prop = wl.tempProp; - if (wl.degree > 0) { - return true; - } - } - return false; + return (wl.tempProp < wl.prop) && (wl.degree > 0); } -std::tuple -BFSWorkload::prePushApply(WorkListItem& wl) +uint32_t +BFSWorkload::apply(WorkListItem& wl) { - uint32_t value = wl.prop; - return std::make_tuple(value, true, false); + wl.prop = wl.tempProp; + return wl.prop; } std::string @@ -137,92 +113,92 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): - GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -{ - numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -} - -void -PRWorkload::init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) -{ - WorkListItem items[numElementsPerLine]; - - pkt->writeDataToBlock((uint8_t*) items, atomSize); - for (int i = 0; i < numElementsPerLine; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - if (items[i].degree > 0) { - needsPush[bit_index_base + i] = 1; - activeBits.push_back(bit_index_base + i); - 
_workCount++; - } - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, atomSize); -} - -uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) -{ - float update_float = writeToFloat(update); - float value_float = writeToFloat(value); - return readFromFloat(update_float + value_float); -} - -uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) -{ - float value_float = writeToFloat(value); - float weight_float = 1.0; - - return readFromFloat(alpha * value_float * weight_float); -} - -bool -PRWorkload::applyCondition(WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return dist >= threshold; -} - -bool -PRWorkload::preWBApply(WorkListItem& wl) -{ - if (applyCondition(wl) && (wl.degree > 0)) { - return true; - } - return false; -} - -std::tuple -PRWorkload::prePushApply(WorkListItem& wl) -{ - if (applyCondition(wl)) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; - uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; - return std::make_tuple(delta_uint, true, true); - } - return std::make_tuple(0, false, false); -} - -std::string -PRWorkload::printWorkListItem(const WorkListItem wl) -{ - float temp_float = writeToFloat(wl.tempProp); - return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, temp_float, wl.degree, wl.edgeIndex - ); -} +// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): +// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) +// { +// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); +// } + +// void +// PRWorkload::init(PacketPtr pkt, int bit_index_base, +// std::bitset& needsPush, +// std::deque& activeBits, +// int& _workCount) +// { +// WorkListItem items[numElementsPerLine]; + 
+// pkt->writeDataToBlock((uint8_t*) items, atomSize); +// for (int i = 0; i < numElementsPerLine; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// if (items[i].degree > 0) { +// needsPush[bit_index_base + i] = 1; +// activeBits.push_back(bit_index_base + i); +// _workCount++; +// } +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, atomSize); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = 1.0; + +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::applyCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return dist >= threshold; +// } + +// bool +// PRWorkload::preWBApply(WorkListItem& wl) +// { +// if (applyCondition(wl) && (wl.degree > 0)) { +// return true; +// } +// return false; +// } + +// std::tuple +// PRWorkload::apply(WorkListItem& wl) +// { +// if (applyCondition(wl)) { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return std::make_tuple(delta_uint, true, true); +// } +// return std::make_tuple(0, false, false); +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, temp_float, wl.degree, 
wl.edgeIndex +// ); +// } } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 6bbc4935c2..f71955bd16 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -34,6 +34,7 @@ #include #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/work_directory.hh" #include "mem/packet.hh" @@ -46,70 +47,54 @@ class GraphWorkload GraphWorkload() {} ~GraphWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount) = 0; + virtual void init(PacketPtr pkt, WorkDirectory* dir) = 0; virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; - virtual bool applyCondition(WorkListItem wl) = 0; - virtual bool preWBApply(WorkListItem& wl) = 0; - virtual std::tuple prePushApply(WorkListItem& wl) = 0; + virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool activeCondition(WorkListItem wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; class BFSWorkload : public GraphWorkload { private: - uint64_t initAddrBase; - int initIndex; + uint64_t initAddr; uint32_t initValue; - int numElementsPerLine; - int atomSize; public: - BFSWorkload(uint64_t init_addr, uint32_t init_value, int atom_size); + BFSWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} ~BFSWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); + virtual void init(PacketPtr pkt, WorkDirectory* dir); virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); + virtual uint32_t apply(WorkListItem& wl); 
+ virtual bool activeCondition(WorkListItem wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - int numElementsPerLine; - int atomSize; - - public: - PRWorkload(float alpha, float threshold, int atom_size); +// public: +// PRWorkload(float alpha, float threshold); - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, int bit_index_base, - std::bitset& needsPush, - std::deque& activeBits, - int& _workCount); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual bool applyCondition(WorkListItem wl); - virtual bool preWBApply(WorkListItem& wl); - virtual std::tuple prePushApply(WorkListItem& wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..0c21833a05 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 8ec9214b49..a447dedc3d 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ 
b/src/accl/graph/sega/CoalesceEngine.py @@ -27,6 +27,7 @@ from m5.params import * from m5.proxy import * +from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -40,9 +41,13 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") - + active_buffer_size = Param.Int("Maximum number of memory active memory " + "atoms ready to send updates. This parameter " + "and post_push_wb_queue_size should be set " + "in tandem. Probably, they should be equal.") post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") + cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/CoalesceEngine_bak.py b/src/accl/graph/sega/CoalesceEngine_bak.py deleted file mode 100644 index 1fd3b968c5..0000000000 --- a/src/accl/graph/sega/CoalesceEngine_bak.py +++ /dev/null @@ -1,50 +0,0 @@ -# -*- coding: utf-8 -*- -# Copyright (c) 2017 Jason Lowe-Power -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from m5.params import * -from m5.proxy import * -from m5.objects.BaseMemoryEngine import BaseMemoryEngine - -class CoalesceEngine(BaseMemoryEngine): - type = 'CoalesceEngine' - cxx_header = "accl/graph/sega/coalesce_engine.hh" - cxx_class = 'gem5::CoalesceEngine' - - cache_size = Param.MemorySize("Size of the internal SRAM array.") - - num_mshr_entry = Param.Int("Number of MSHR entries.") - - num_tgts_per_mshr = Param.Int("Number of Targets Per MSHR.") - - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " - "requestor in each cycle. 
Used to limit b/w.") - - post_push_wb_queue_size = Param.Int("Maximum number of pending wb after " - "apply process for applications that require " - "the apply process to happen exactly before " - "pushing the edgePointer to the PushEngine.") - diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 5d411be9ac..b3e1a838fb 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -37,6 +37,7 @@ SimObject("WLEngine.py", sim_objects=["WLEngine"]) Source("base_memory_engine.cc") Source("centeral_controller.cc") Source("coalesce_engine.cc") +Source("enums.cc") Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") @@ -45,10 +46,10 @@ DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") DebugFlag("CacheBlockState") DebugFlag("CoalesceEngine") -DebugFlag("FinalAnswer") DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") +DebugFlag("MSDebug") DebugFlag("WLEngine") CompoundFlag("MPU", ["CoalesceEngine", "PushEngine", - "WLEngine", "BaseMemoryEngine"]) \ No newline at end of file + "WLEngine", "BaseMemoryEngine"]) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fc2262e111..883992e64e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,6 +82,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { + mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount()> 0)) { mpu->start(); } @@ -106,14 +107,14 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { - workload = new BFSWorkload(init_addr, init_value, system->cacheLineSize()); + workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold, 
system->cacheLineSize()); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); +// } void CenteralController::recvDoneSignal() @@ -144,6 +145,7 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { + workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 9ddb1b35f0..6eb07dbcac 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -35,7 +35,6 @@ #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" -#include "debug/FinalAnswer.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -64,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 66ff66c068..0aa61345f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,6 +34,7 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" +#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -42,26 +43,23 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), + BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / 
peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - maxRespPerCycle(params.max_resp_per_cycle), cacheWorkCount(0), - numPullsReceived(0), activeBufferSize(params.post_push_wb_queue_size), + maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), pendingPullReads(0), + activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), - pendingPullReads(0), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), nextResponseEvent([this] { processNextResponseEvent(); }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - nextPrePushApplyEvent([this] { - processNextPrePushApplyEvent(); - }, name() + ".nextPrePushApplyEvent"), + nextApplyEvent([this] { + processNextApplyEvent(); + }, name() + ".nextApplyEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -69,6 +67,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + activeBuffer.clear(); + postPushWBQueue.clear(); } void @@ -85,7 +85,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // TODO: Check postPushWBQueue for hits + // FIXME: Check postPushWBQueue for hits if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -97,54 +97,70 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - int bit_index_base = getBitIndexBase(pkt->getAddr()); - // FIXME: Pass workdirectory to graphworkload.init - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); + graphWorkload->init(pkt, directory); + if (pkt->getAddr() > 
lastAtomAddr) { + lastAtomAddr = pkt->getAddr(); + } memPort.sendFunctional(pkt); } } +void +CoalesceEngine::postMemInitSetup() +{ + directory->setLastAtomAddr(lastAtomAddr); +} + +void +CoalesceEngine::createPopCountDirectory(int atoms_per_block) +{ + directory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + bool CoalesceEngine::done() { - // FIXME: Fix this later - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && + activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); } -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) +bool +CoalesceEngine::timeToPull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; + return (activeBuffer.size() + pendingPullReads) < activeBufferSize; } -// FIXME: This and the next function should be moved to the -// WorkDirectory. 
-// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) +bool +CoalesceEngine::canSchedulePull() { - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; + // TODO: Maybe a good idea to change this to + // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize + return pullsScheduled < 1; } -// FIXME: Read FIXME: Above -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) +bool +CoalesceEngine::workLeftInMem() { - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); + return !directory->empty(); } bool +CoalesceEngine::pullCondition() +{ + return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); +} + +// addr should be aligned to peerMemoryAtomSize +int +CoalesceEngine::getBlockIndex(Addr addr) +{ + assert((addr % peerMemoryAtomSize) == 0); + Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); + return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; +} + +ReadReturnStatus CoalesceEngine::recvWLRead(Addr addr) { Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); @@ -163,6 +179,9 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit + if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { + return ReadReturnStatus::REJECT_NO_ROLL; + } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); @@ -197,7 +216,7 @@ CoalesceEngine::recvWLRead(Addr addr) 
schedule(nextResponseEvent, nextCycle()); } stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].state == CacheState::PENDING_DATA)) { // Hit under miss @@ -207,7 +226,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(!cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); @@ -217,7 +235,7 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexReads++; - return true; + return ReadReturnStatus::ACCEPT; } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); @@ -232,20 +250,37 @@ CoalesceEngine::recvWLRead(Addr addr) if (cacheBlocks[block_index].state == CacheState::IDLE) { if (cacheBlocks[block_index].dirty) { cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // NOTE: move the cache block to invalid state - // FIXME: Fix the issue below. - // May need to activate tracking for this + // NOTE: The cache block could still be active but + // not dirty. If active we only have to active tracking + // but can throw the data away. 
+ bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } + // NOTE: Bring the cache line to invalid state. + // NOTE: Above line where we set hasConflict to true + // does not matter anymore since we reset the cache line. cacheBlocks[block_index].reset(); } + return ReadReturnStatus::REJECT_NO_ROLL; + } else { + return ReadReturnStatus::REJECT_ROLL; } - // return int instead of bool to tell WLEngine to whether - // roll the first entry in the queue. - return false; } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); @@ -255,16 +290,21 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].valid = false; cacheBlocks[block_index].dirty = false; cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].needsPreWBApply = false; cacheBlocks[block_index].state = CacheState::PENDING_DATA; cacheBlocks[block_index].lastChangedTick = curTick(); + + MSHR[block_index].push_back(addr); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextRead(block_index, schedule_tick); }, block_index, curTick()); - return true; + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + return ReadReturnStatus::ACCEPT; } else { - return false; + return ReadReturnStatus::REJECT_ROLL; } } } @@ -276,116 +316,87 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) assert(pkt->isResponse()); DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", __func__, pkt->print()); + + onTheFlyReqs--; if (pkt->isWrite()) { DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - 
WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. - - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } + } else { + assert(pkt->isRead()); + Addr addr = pkt->getAddr(); + int block_index = getBlockIndex(addr); + ReadPurpose* purpose = pkt->findNextSenderState(); + + // NOTE: Regardless of where the pkt will go we have to release the + // reserved space for this pkt in the activeBuffer in case + // it was read from memory for placement in the activeBuffer. + // NOTE: Also we have to stop tracking the address for pullAddrs + if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + pendingPullReads--; + pendingPullAddrs.erase(addr); } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } + if (cacheBlocks[block_index].addr == addr) { + // If it is in the cache, line should be in PENDING_DATA state. 
+ // Regardless of the purpose for which it was read, it should + // be placed in the cache array. + assert(cacheBlocks[block_index].busyMask == 0); + assert(!cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].dirty); + assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); + + // NOTE: Since it is in PENDING_DATA state it + // should have an entry in the MSHR. + assert(MSHR.find(block_index) != MSHR.end()); + + pkt->writeDataToBlock((uint8_t*) cacheBlocks[block_index].items, + peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + // HACK: In case the pkt was read for push but it was allocated + // for in the cache later on, we should cancel the future + // processNextRead for this block. We could set lastChangedTick + // to curTick() like usual. However, there is no way to ensure + // that processNextRead will be not be called on the same tick + // as the pkt arrives from the memory. Therefore, we will set + // the lastChangedTick to half a cycle before the actual time. + // We move that back in time because it would be fine if + // processNextRead happened before pkt arriveed. processNextRead + // actually will check if there is a pending read for push for + // the address it's trying to populate. 
+ if (purpose->dest() == ReadDestination::READ_FOR_PUSH) { + cacheBlocks[block_index].lastChangedTick = + curTick() - (Tick) (clockPeriod() / 2); + } else { + cacheBlocks[block_index].lastChangedTick = curTick(); + } - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); 
- } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } + // NOTE: If the atom is active we have to deactivate the tracking + // of this atom in the memory since it's not in memory anymore. + // Since it is going to the cache, cache will be responsible for + // tracking this. Push to activeCacheBlocks for simulator speed + // instead of having to search for active blocks in the cache. + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeCacheBlocks.push_back(block_index); + } - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); + assert(MSHR.find(block_index) != MSHR.end()); + for (auto it = MSHR[block_index].begin(); + it != MSHR[block_index].end();) { + Addr miss_addr = *it; + Addr aligned_miss_addr = + roundDown(miss_addr, peerMemoryAtomSize); - if (aligned_miss_addr == addr) { + assert(aligned_miss_addr == cacheBlocks[block_index].addr); int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " "cacheBlocks[%d] can be serviced with the received " "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function responseQueue.push_back(std::make_tuple(miss_addr, cacheBlocks[block_index].items[wl_offset], curTick())); DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " @@ -400,32 +411,72 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) graphWorkload->printWorkListItem( cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); - // TODO: 
Add a stat to count the number of WLItems that have been touched. cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); it = MSHR[block_index].erase(it); + } + MSHR.erase(block_index); + + cacheBlocks[block_index].state = CacheState::BUSY; + if ((!nextResponseEvent.scheduled()) && (!responseQueue.empty())) { + schedule(nextResponseEvent, nextCycle()); + } + delete pkt; + } else { + assert(purpose->dest() == ReadDestination::READ_FOR_PUSH); + // There should be enough room in activeBuffer to place this pkt. + // REMEMBER: If dest == READ_FOR_PUSH we release the reserved space. + // So at this point in code we should have at least one free entry + // in the active buffer which is reserved for this pkt. + assert(activeBuffer.size() + pendingPullReads < activeBufferSize); + + WorkListItem items[numElementsPerLine]; + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + if (atom_active) { + directory->deactivate(addr); + activeBuffer.emplace_back(pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); } else { - it++; + delete pkt; + } + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + pullsScheduled++; } } } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); + if (done()) { + owner->recvDoneSignal(); } - - delete pkt; return true; } -// TODO: For loop to empty the entire responseQueue. void CoalesceEngine::processNextResponseEvent() { @@ -450,8 +501,8 @@ CoalesceEngine::processNextResponseEvent() addr_response); responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, + DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue." + " responseQueue.size = %d.\n", __func__, responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. " "responseQueue.size = %d.\n", __func__, @@ -491,27 +542,28 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " "with Addr: %lu.\n", __func__, graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. + + // NOTE: Design does not allow for write misses. 
assert(cacheBlocks[block_index].addr == aligned_addr); // cache state asserts - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); + assert(cacheBlocks[block_index].valid); + assert(cacheBlocks[block_index].state == CacheState::BUSY); // respective bit in busyMask for wl is set. assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == (1 << wl_offset)); if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; + cacheBlocks[block_index].dirty |= true; } cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; + if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && + (!activeCacheBlocks.find(block_index))) { + activeCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } } cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); @@ -523,188 +575,40 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); - // TODO: Make this more general and programmable. 
- if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = preWBApplyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the preWBApplyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - - if (cacheBlocks[block_index].state == CacheState::PENDING_PRE_WB_APPLY) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsPreWBApply); - bool block_active = false; - for (int index = 0; index < numElementsPerLine; index++) { - bool active = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - block_active |= active; - if (active) { - // cacheWorkCount++; - // FUTUREME: When pulling from activeCacheBlocks, in case we - // face a block that is not in idle state, we basically pop - // that entry and push it to the back. We only delete entries - // in this buffer if pushed or evicted. 
- activeCacheBlocks.push_back(block_index); - } - } - if (block_active && !owner->running()) { - owner->start(); - } - - cacheBlocks[block_index].needsPreWBApply = false; + if (cacheBlocks[block_index].busyMask == 0) { if (cacheBlocks[block_index].hasConflict) { if (cacheBlocks[block_index].dirty) { + cacheBlocks[block_index].state = CacheState::PENDING_WB; + cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } } else { - // FIXME: Solve below issue. - // Not dirty but could be active still. - // need to activate tracking + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); + } + if (atom_active) { + activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); + } cacheBlocks[block_index].reset(); } } else { cacheBlocks[block_index].state = CacheState::IDLE; - } - cacheBlocks[block_index].lastChangedTick = curTick(); - } else { - - } - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - 
cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); } - - if (done()) { + DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, + block_index, cacheBlocks[block_index].to_string()); + stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } } @@ -740,6 +644,10 @@ CoalesceEngine::processNextMemoryEvent() if ((!memoryFunctionQueue.empty())) { schedule(nextMemoryEvent, nextCycle()); } + + if (done()) { + owner->recvDoneSignal(); + } } void @@ -759,36 +667,68 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].valid); assert(!cacheBlocks[block_index].dirty); - assert(!cacheBlocks[block_index].needsPreWBApply); 
assert(cacheBlocks[block_index].state == CacheState::PENDING_DATA); bool need_send_pkt = true; // NOTE: Search postPushWBQueue - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end();) { PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr = wb_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { wb_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + need_send_pkt = false; - postPushWBQueue.erase(wb); + wb = postPushWBQueue.erase(wb); + delete wb_pkt; + DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + cacheBlocks[block_index].addr, postPushWBQueue.size()); + } else { + wb++; } } - for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + // NOTE: Search activeBuffer + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end();) { PacketPtr ab_pkt = std::get<0>(*ab); - if (cacheBlocks[block_index].addr = ab_pkt->getAddr()) { + if (cacheBlocks[block_index].addr == ab_pkt->getAddr()) { ab_pkt->writeDataToBlock( (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); + + cacheBlocks[block_index].valid = true; + cacheBlocks[block_index].dirty = true; + cacheBlocks[block_index].lastChangedTick = curTick(); + activeCacheBlocks.push_back(block_index); + need_send_pkt = false; - activeBuffer.erase(ab); + ab = activeBuffer.erase(ab); + delete ab_pkt; + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // pullsScheduled++; + // } + DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. 
" + "activeBuffer.size: %d.\n", __func__, + cacheBlocks[block_index].addr, activeBuffer.size()); + if (pullCondition()) { + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); + pullsScheduled++; + } + } else { + ab++; } } if (!need_send_pkt) { - cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsPreWBApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { Addr miss_addr = *it; Addr aligned_miss_addr = @@ -828,14 +768,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].state = CacheState::BUSY; } - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { + if (pendingPullAddrs.find(cacheBlocks[block_index].addr) != + pendingPullAddrs.end()) { need_send_pkt = false; } if (need_send_pkt) { PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_CACHE); + pkt->pushSenderState(purpose); DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); @@ -852,25 +794,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) block_index, cacheBlocks[block_index].to_string()); if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].busyMask == 0); + assert(cacheBlocks[block_index].valid); assert(cacheBlocks[block_index].dirty); assert(cacheBlocks[block_index].hasConflict); - assert(!cacheBlocks[block_index].needsPreWBApply); assert(cacheBlocks[block_index].state == CacheState::PENDING_WB); - Addr base_addr = cacheBlocks[block_index].addr; + // NOTE: If the atom we're writing back is active, we have to + // stop tracking it in the cache and start tracking it in the memory. + bool atom_active = false; for (int index = 0; index < numElementsPerLine; index++) { - if (cacheBlocks[block_index].items[index].active) { - Addr vertex_addr = base_addr + index * sizeof(WorkListItem); - // NOTE: Implement this - // workdir.activate() - // cacheWorkCount--; - } + atom_active |= graphWorkload->activeCondition( + cacheBlocks[block_index].items[index]); } - if (activeCacheBlocks.find(block_index)) { + if (atom_active) { activeCacheBlocks.erase(block_index); + directory->activate(cacheBlocks[block_index].addr); } + PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, (uint8_t*) cacheBlocks[block_index].items); @@ -878,9 +819,8 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); memPort.sendPacket(pkt); + onTheFlyReqs++; cacheBlocks[block_index].reset(); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); } else { @@ 
-896,94 +836,54 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { + if (postPushWBQueue.empty()) { + return; + } PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); if (schedule_tick == pkt_tick) { memPort.sendPacket(wb_pkt); + onTheFlyReqs++; postPushWBQueue.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); } } void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; + pullsScheduled--; + if (!directory->empty()) { + Addr addr = directory->getNextWork(); + int block_index = getBlockIndex(addr); + + bool in_cache = cacheBlocks[block_index].addr == addr; + bool in_active_buffer = false; + for (auto ab = activeBuffer.begin(); ab != activeBuffer.end(); ab++) { + PacketPtr pkt = std::get<0>(*ab); + in_active_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; + bool in_write_buffer = false; + for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) + { + PacketPtr pkt = std::get<0>(*wb); + in_write_buffer |= (pkt->getAddr() == addr); } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } + bool repeat_work = pendingPullAddrs.find(addr) != pendingPullAddrs.end(); + + if (!in_cache && !in_active_buffer && !in_write_buffer && !repeat_work) { + PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); + ReadPurpose* purpose = new ReadPurpose(ReadDestination::READ_FOR_PUSH); + pkt->pushSenderState(purpose); + 
memPort.sendPacket(pkt); + onTheFlyReqs++; + pendingPullReads++; + pendingPullAddrs.insert(addr); } } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } } void @@ -1000,26 +900,149 @@ CoalesceEngine::recvMemRetry() schedule(nextMemoryEvent, nextCycle()); } +int +CoalesceEngine::workCount() +{ + return activeCacheBlocks.size() + + directory->workCount() + activeBuffer.size(); +} + void CoalesceEngine::recvVertexPull() { - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; + pullsReceived++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived); stats.verticesPulled++; stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { + if (!nextApplyEvent.scheduled()) { + schedule(nextApplyEvent, nextCycle()); + } +} + +void +CoalesceEngine::processNextApplyEvent() +{ + if ((!activeBuffer.empty()) && + (postPushWBQueue.size() < postPushWBQueueSize)) { + PacketPtr pkt; + Tick entrance_tick; + WorkListItem items[numElementsPerLine]; + + std::tie(pkt, entrance_tick) = activeBuffer.front(); + pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(items[index])) { + Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(items[index]); + owner->recvVertexPush(addr, delta, items[index].edgeIndex, + items[index].degree); + pullsReceived--; + } + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); + + bool atom_active = false; + for (int index = 0; index < 
numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(items[index]); + } + // NOTE: If the atom is not active anymore. + if (!atom_active) { + PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), + peerMemoryAtomSize, (uint8_t*) items); + postPushWBQueue.emplace_back(wb_pkt, curTick()); + DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " + "postPushWBQueue.size: %d.\n", __func__, + wb_pkt->print(), postPushWBQueue.size()); + activeBuffer.pop_front(); + DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. " + "activeBuffer.size: %d.\n", __func__, + pkt->print(), activeBuffer.size()); + memoryFunctionQueue.emplace_back( + [this] (int ignore, Tick schedule_tick) { + processNextPostPushWB(ignore, schedule_tick); + }, 0, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); + } + delete pkt; + } + } else if (!activeCacheBlocks.empty()) { + int num_visited_indices = 0; + int initial_fifo_length = activeCacheBlocks.size(); + while (true) { + int block_index = activeCacheBlocks.front(); + if (cacheBlocks[block_index].state == CacheState::IDLE) { + for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); + uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].dirty = true; + owner->recvVertexPush(addr, delta, + cacheBlocks[block_index].items[index].edgeIndex, + cacheBlocks[block_index].items[index].degree); + pullsReceived--; + } + } + + bool atom_active = false; + for (int index = 0; index < numElementsPerLine; index++) { + atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + } + // NOTE: If we have reached the last item in the cache block + if (!atom_active) { + 
activeCacheBlocks.erase(block_index); + } + break; + } + // NOTE: If the block with index at the front of activeCacheBlocks + // is not in IDLE state, then roll the that index to the back + activeCacheBlocks.pop_front(); + activeCacheBlocks.push_back(block_index); + // NOTE: If we have visited all the items initially in the FIFO. + num_visited_indices++; + if (num_visited_indices == initial_fifo_length) { + break; + } + } + } else { + DPRINTF(CoalesceEngine, "%s: Could not find " + "work to apply.\n", __func__); + } + + // if (workLeftInMem() && timeToPull() && canSchedulePull()) { + // memoryFunctionQueue.emplace_back( + // [this] (int ignore, Tick schedule_tick) { + // processNextVertexPull(ignore, schedule_tick); + // }, 0, curTick()); + // if ((!nextMemoryEvent.pending()) && + // (!nextMemoryEvent.scheduled())) { + // schedule(nextMemoryEvent, nextCycle()); + // } + // pullsScheduled++; + // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); + [this] (int ignore, Tick schedule_tick) { + processNextVertexPull(ignore, schedule_tick); + }, 0, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + pullsScheduled++; + } + + if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + schedule(nextApplyEvent, nextCycle()); } } + CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), coalesce(_coalesce), @@ -1036,16 +1059,11 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache hit under misses."), ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), ADD_STAT(responsePortShortage, 
statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. " - "Once for push and once to populate the cache."), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1054,13 +1072,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate - #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" +#include "accl/graph/sega/work_directory.hh" #include "base/cprintf.hh" #include "base/statistics.hh" #include "params/CoalesceEngine.hh" - - namespace gem5 { -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -enum CacheState -{ - INVALID, - PENDING_DATA, - BUSY, - IDLE, - PENDING_PRE_WB_APPLY, - PENDING_WB, - PENDING_PRE_PUSH_APPLY, - NUM_CACHE_STATE -}; - -const char* 
cacheStateStrings[NUM_CACHE_STATE] = { - "INVALID", - "PENDING_DATA", - "BUSY", - "IDLE", - "PENDING_PRE_WB_APPLY", - "PENDING_WB", - "PENDING_PRE_PUSH_APPLY" -}; - -enum ReadDestination -{ - READ_FOR_CACHE, - READ_FOR_PUSH -}; - class MPU; - -// TODO: Add active bit to WorkListItem class. Check active bit before activate -// Only activate if necessary and not active before. -class WorkDirectory -{ - private: - Addr memoryAtomSize; - int atomBlockSize; - size_t elementSize; - - int _workCount; - public: - AddrRange memoryRange; - WorkDirectory(Addr atom_size, int block_size, size_t element_size): - memoryAtomSize(atom_size), atomBlockSize(block_size), - elementSize(element_size), _workCount(0) - {} - - void activate(Addr addr); - void deactivate(Addr addr); - int workCount(); - std::tuple getNextWork(); -}; - class CoalesceEngine : public BaseMemoryEngine { private: @@ -117,7 +54,6 @@ class CoalesceEngine : public BaseMemoryEngine bool valid; bool dirty; bool hasConflict; - bool needsPreWBApply; CacheState state; Tick lastChangedTick; Block() {} @@ -127,7 +63,6 @@ class CoalesceEngine : public BaseMemoryEngine valid(false), dirty(false), hasConflict(false), - needsPreWBApply(false), state(CacheState::INVALID), lastChangedTick(0) { @@ -140,18 +75,15 @@ class CoalesceEngine : public BaseMemoryEngine valid = false; dirty = false; hasConflict = false; - needsPreWBApply = false; state = CacheState::INVALID; lastChangedTick = 0; } std::string to_string() { return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "dirty: %s, hasConflict: %s, needsPreWBApply: %s" - "state: %s, lastChangedTick: %lu}", addr, busyMask, - valid ? "true" : "false", dirty ? "true" : "false", - hasConflict ? "true" : "false", - needsPreWBApply ? "true" : "false", + "dirty: %s, hasConflict: %s, state: %s, lastChangedTick: %lu}", + addr, busyMask, valid ? "true" : "false", + dirty ? "true" : "false", hasConflict ? 
"true" : "false", cacheStateStrings[state], lastChangedTick); } }; @@ -164,8 +96,11 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; + WorkDirectory* directory; GraphWorkload* graphWorkload; + Addr lastAtomAddr; + int numLines; int numElementsPerLine; Block* cacheBlocks; @@ -179,26 +114,26 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> responseQueue; // Tracking work in cache - int cacheWorkCount; - int numPullsReceived; - UniqueFIFO preWBApplyQueue; + int pullsReceived; // NOTE: Remember to erase from this upon eviction from cache UniqueFIFO activeCacheBlocks; + int pullsScheduled; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; + std::unordered_set pendingPullAddrs; int activeBufferSize; int postPushWBQueueSize; std::deque> activeBuffer; std::deque> postPushWBQueue; + bool timeToPull(); + bool canSchedulePull(); + bool workLeftInMem(); + bool pullCondition(); int getBlockIndex(Addr addr); - // TODO: Should be moved to WorkDirectory - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -212,11 +147,8 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - EventFunctionWrapper nextPrePushApplyEvent; - void processNextPrePushApplyEvent(); + EventFunctionWrapper nextApplyEvent; + void processNextApplyEvent(); struct CoalesceStats : public statistics::Group { @@ -236,19 +168,14 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; - 
statistics::Scalar numDoubleMemReads; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; statistics::Scalar numInvalidWriteBacks; - statistics::Vector bitvectorSearchStatus; - statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; @@ -272,12 +199,14 @@ class CoalesceEngine : public BaseMemoryEngine void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } virtual void recvFunctional(PacketPtr pkt); - bool recvWLRead(Addr addr); + void postMemInitSetup(); + + void createPopCountDirectory(int atoms_per_block); + + ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); - // FIXME: Update this to return sum of cacheWorkCount and WorkDirectory - // workcount. - int workCount() { return _workCount; } + int workCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/coalesce_engine_bak.cc b/src/accl/graph/sega/coalesce_engine_bak.cc deleted file mode 100644 index 7a064c1c2f..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.cc +++ /dev/null @@ -1,1308 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#include "accl/graph/sega/coalesce_engine.hh" - -#include - -#include "accl/graph/sega/mpu.hh" -#include "base/intmath.hh" -#include "debug/CacheBlockState.hh" -#include "debug/CoalesceEngine.hh" -#include "debug/SEGAStructureSize.hh" -#include "mem/packet_access.hh" -#include "sim/sim_exit.hh" - -namespace gem5 -{ - -CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), - numLines((int) (params.cache_size / peerMemoryAtomSize)), - numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), - numTgtsPerMSHR(params.num_tgts_per_mshr), - maxRespPerCycle(params.max_resp_per_cycle), _workCount(0), - numPullsReceived(0), postPushWBQueueSize(params.post_push_wb_queue_size), - maxPotentialPostPushWB(0), - nextMemoryEvent([this] { - processNextMemoryEvent(); - }, name() + ".nextMemoryEvent"), - nextResponseEvent([this] { - processNextResponseEvent(); - }, name() + ".nextResponseEvent"), - nextPreWBApplyEvent([this] { - processNextPreWBApplyEvent(); - }, name() + ".nextPreWBApplyEvent"), - stats(*this) -{ - assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); - cacheBlocks = new Block [numLines]; - for (int i = 0; i < numLines; i++) { - cacheBlocks[i] = Block(numElementsPerLine); - } - needsPush.reset(); -} - -void -CoalesceEngine::registerMPU(MPU* mpu) -{ - owner = mpu; -} - -void -CoalesceEngine::recvFunctional(PacketPtr pkt) -{ - if (pkt->isRead()) { - assert(pkt->getSize() == peerMemoryAtomSize); - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - - if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid)) { - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsApply); - // NOTE: No need to check needsWB because there might be entries - // that have been updated and not written back in the cache. 
- // assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - pkt->makeResponse(); - pkt->setDataFromBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - } else { - memPort.sendFunctional(pkt); - } - } else { - // TODO: Add and implement init function for GraphWorkload. - int bit_index_base = getBitIndexBase(pkt->getAddr()); - graphWorkload->init(pkt, bit_index_base, needsPush, activeBits, _workCount); - memPort.sendFunctional(pkt); - } -} - -bool -CoalesceEngine::done() -{ - return applyQueue.empty() && needsPush.none() && - memoryFunctionQueue.empty() && (onTheFlyReqs == 0); -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBlockIndex(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - return ((int) (trimmed_addr / peerMemoryAtomSize)) % numLines; -} - -// addr should be aligned to peerMemoryAtomSize -int -CoalesceEngine::getBitIndexBase(Addr addr) -{ - assert((addr % peerMemoryAtomSize) == 0); - Addr trimmed_addr = peerMemoryRange.removeIntlvBits(addr); - int atom_index = (int) (trimmed_addr / peerMemoryAtomSize); - int block_bits = (int) (peerMemoryAtomSize / sizeof(WorkListItem)); - return atom_index * block_bits; -} - -// index should be aligned to (peerMemoryAtomSize / sizeof(WorkListItem)) -Addr -CoalesceEngine::getBlockAddrFromBitIndex(int index) -{ - assert((index % ((int) (peerMemoryAtomSize / sizeof(WorkListItem)))) == 0); - Addr trimmed_addr = index * sizeof(WorkListItem); - return peerMemoryRange.addIntlvBits(trimmed_addr); -} - -bool -CoalesceEngine::recvWLRead(Addr addr) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - assert(aligned_addr % peerMemoryAtomSize == 0); - int block_index = getBlockIndex(aligned_addr); - assert(block_index < numLines); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - assert(wl_offset < 
numElementsPerLine); - DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. " - "This request maps to cacheBlocks[%d], aligned_addr: " - "%lu, and wl_offset: %d.\n", __func__, addr, - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].valid)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); - stats.readHits++; - assert(!cacheBlocks[block_index].pendingData); - // No cache block could be in pendingApply and pendingWB at the - // same time. - assert(!(cacheBlocks[block_index].pendingApply && - cacheBlocks[block_index].pendingWB)); - // Hit - // TODO: Add a hit latency as a param for this object. - // Can't just schedule the nextResponseEvent for latency cycles in - // the future. - responseQueue.push_back(std::make_tuple( - addr, cacheBlocks[block_index].items[wl_offset], curTick())); - - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // If they are scheduled for apply and WB those schedules should be - // discarded. Since there is no easy way to take items out of the - // function queue. Those functions check for their respective bits - // and skip the process if the respective bit is set to false. 
- cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - // HACK: If a read happens on the same cycle as another operation such - // as apply set lastChangedTick to half a cycle later so that operation - // scheduled by the original operation (apply in this example) are - // invalidated. For more details refer to "accl/graph/sega/busyMaskErr" - cacheBlocks[block_index].lastChangedTick = - curTick() + (Tick) (clockPeriod() / 2); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - if (!nextResponseEvent.scheduled()) { - schedule(nextResponseEvent, nextCycle()); - } - stats.numVertexReads++; - return true; - } else if ((cacheBlocks[block_index].addr == aligned_addr) && - (cacheBlocks[block_index].pendingData)) { - // Hit under miss - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit under miss.\n", - __func__, addr); - stats.readHitUnderMisses++; - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. 
Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR entries are available for " - "cacheBlocks[%d].\n", __func__, block_index); - } - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - stats.numVertexReads++; - return true; - } else { - // miss - // FIXME: Make this assert work. It will break if the cache block - // is cold and addr or aligned_addr is 0. It fails because cache block - // addr field is initialized to 0. Unfortunately Addr type is unsigned. - // So you can not initialized addr to -1. - assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); - DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - if (MSHR.find(block_index) == MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for Addr:" - " %lu not found in MSHRs.\n", __func__, block_index, addr); - if (MSHR.size() == numMSHREntries) { - // Out of MSHR entries - DPRINTF(CoalesceEngine, "%s: Out of MSHR entries. 
" - "Rejecting request.\n", __func__); - // TODO: Break out read rejections into more than one stat - // based on the cause of the rejection - stats.mshrEntryShortage++; - return false; - } else { - DPRINTF(CoalesceEngine, "%s: MSHR " - "entries available.\n", __func__); - if ((cacheBlocks[block_index].valid) || - (cacheBlocks[block_index].pendingData)) { - DPRINTF(CoalesceEngine, "%s: Addr: %lu has a conflict " - "with Addr: %lu.\n", __func__, addr, - cacheBlocks[block_index].addr); - if ((cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state.\n", __func__, block_index); - // We're in idle state - // Idle: valid && !pendingApply && !pendingWB; - // Note 0: needsApply has to be false. Because - // A cache line enters the idle state from two - // other states. First a busy state that does not - // need apply (needsApply is already false) or - // from pendingApplyState after being applied which - // clears the needsApply bit. needsApply is useful - // when a cache block has transitioned from - // pendingApply to busy without the apply happening. - // Note 1: pendingData does not have to be evaluated - // becuase pendingData is cleared when data - // arrives from the memory and valid does not - // denote cleanliness of the line. Rather it - // is used to differentiate between empty blocks - // and the blocks that have data from memory. - // pendingData denotes the transient state between - // getting a miss and getting the data for that miss. - // valid basically means that the data in the cache - // could be used to respond to read/write requests. - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - // There are no conflicts in idle state. 
- assert(MSHR.find(block_index) == MSHR.end()); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs" - "to be written back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextWriteBack for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does " - "not need to be written back.\n", - __func__, block_index); - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed " - "processNextRead for input " - "%d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: " - "%s.\n", __func__, block_index, - cacheBlocks[block_index].to_string()); - } - } - // cacheBlocks[block_index].hasConflict = true; - 
MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - stats.readMisses++; - // TODO: Add readConflicts here. - stats.numVertexReads++; - return true; - } else { - // MSHR available and no conflict - DPRINTF(CoalesceEngine, "%s: Addr: %lu has no conflict. " - "Allocating a cache line for it.\n" - , __func__, addr); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR[block_index].size() == 0); - - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Allocated cacheBlocks[%d] for" - " Addr: %lu.\n", __func__, block_index, addr); - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets " - "for cacheBlocks[%d].\n", __func__, addr, block_index); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for " - "input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - 
} - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, - cacheBlocks[block_index].to_string()); - stats.readMisses++; - stats.numVertexReads++; - return true; - } - } - } else { - DPRINTF(CoalesceEngine, "%s: Respective cacheBlocks[%d] for " - "Addr: %lu already in MSHRs. It has a conflict " - "with addr: %lu.\n", __func__, block_index, addr, - cacheBlocks[block_index].addr); - assert(MSHR[block_index].size() <= numTgtsPerMSHR); - assert(MSHR[block_index].size() > 0); - if (MSHR[block_index].size() == numTgtsPerMSHR) { - DPRINTF(CoalesceEngine, "%s: Out of targets for " - "cacheBlocks[%d]. Rejecting request.\n", - __func__, block_index); - stats.mshrTargetShortage++; - return false; - } - DPRINTF(CoalesceEngine, "%s: There is room for another target " - "for cacheBlocks[%d].\n", __func__, block_index); - - // TODO: Might want to differentiate between different misses. - stats.readMisses++; - - MSHR[block_index].push_back(addr); - stats.mshrEntryLength.sample(MSHR[block_index].size()); - DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to targets for " - "cacheBlocks[%d].\n", __func__, addr, block_index); - stats.numVertexReads++; - return true; - } - } -} - -bool -CoalesceEngine::handleMemResp(PacketPtr pkt) -{ - assert(pkt->isResponse()); - DPRINTF(CoalesceEngine, "%s: Received packet: %s from memory.\n", - __func__, pkt->print()); - if (pkt->isWrite()) { - DPRINTF(CoalesceEngine, "%s: Dropped the write response.\n", __func__); - delete pkt; - return true; - } - - onTheFlyReqs--; - Addr addr = pkt->getAddr(); - int block_index = getBlockIndex(addr); - WorkListItem* items = pkt->getPtr(); - - bool do_wb = false; - if (pkt->findNextSenderState()) { - assert(!((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid))); - // We have read the address to send the wl and it is not in the - // cache. Simply send the items to the PushEngine. 
- - DPRINTF(CoalesceEngine, "%s: Received read response for pull read " - "for addr %lu.\n", __func__, addr); - int it = getBitIndexBase(addr); - uint64_t send_mask = pendingVertexPullReads[addr]; - // No applying of the line needed. - for (int i = 0; i < numElementsPerLine; i++) { - Addr vertex_addr = addr + i * sizeof(WorkListItem); - uint64_t vertex_send_mask = send_mask & (1 << i); - if (vertex_send_mask != 0) { - assert(needsPush[it + i] == 1); - needsPush[it + i] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb_v; - std::tie(delta, do_push, do_wb_v) = - graphWorkload->prePushApply(items[i]); - do_wb |= do_wb_v; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - items[i].edgeIndex, items[i].degree); - } else { - // TODO: Add a stat to count this. - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - } - } - pendingVertexPullReads.erase(addr); - maxPotentialPostPushWB--; - } - - bool cache_wb = false; - if (cacheBlocks[block_index].addr == addr) { - DPRINTF(CoalesceEngine, "%s: Received read response to " - "fill cacheBlocks[%d].\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - assert(MSHR.find(block_index) != MSHR.end()); - std::memcpy(cacheBlocks[block_index].items, items, peerMemoryAtomSize); - for (int i = 0; i < numElementsPerLine; i++) { - DPRINTF(CoalesceEngine, "%s: Wrote cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, i, graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[i])); - } - 
cacheBlocks[block_index].valid = true; - cacheBlocks[block_index].needsWB |= do_wb; - cacheBlocks[block_index].pendingData = false; - // HACK: In case processNextRead is called on the same tick as curTick - // and is scheduled to read to the same cacheBlocks[block_index] - cacheBlocks[block_index].lastChangedTick = - curTick() - (Tick) (clockPeriod() / 2); - cache_wb = true; - } else if (do_wb) { - PacketPtr wb_pkt = createWritePacket( - addr, peerMemoryAtomSize, (uint8_t*) items); - postPushWBQueue.emplace_back(wb_pkt, curTick()); - memoryFunctionQueue.emplace_back( - [this] (int ignore, Tick schedule_tick) { - processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - // TODO: Add a stat to count this. - // FIXME: This is not a totally wasteful read. e.g. all reads - // for pull in BFS are like this. - DPRINTF(CoalesceEngine, "%s: No write destination for addr: %lu.\n", __func__, addr); - } - - if (cache_wb) { - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. 
responseQueue.size = %d.\n", - __func__, addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - // cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - } - - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - - - // TODO: Probably check for done here too. - delete pkt; - return true; -} - -// TODO: For loop to empty the entire responseQueue. -void -CoalesceEngine::processNextResponseEvent() -{ - int num_responses_sent = 0; - - Addr addr_response; - WorkListItem worklist_response; - Tick response_queueing_tick; - while(true) { - std::tie(addr_response, worklist_response, response_queueing_tick) = - responseQueue.front(); - Tick waiting_ticks = curTick() - response_queueing_tick; - if (ticksToCycles(waiting_ticks) < 1) { - break; - } - owner->handleIncomingWL(addr_response, worklist_response); - num_responses_sent++; - DPRINTF(CoalesceEngine, - "%s: Sent WorkListItem: %s with addr: %lu to WLEngine.\n", - __func__, - graphWorkload->printWorkListItem(worklist_response), - addr_response); - - responseQueue.pop_front(); - DPRINTF(SEGAStructureSize, "%s: Popped a response from responseQueue. " - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Popped a response from responseQueue. 
" - "responseQueue.size = %d.\n", __func__, - responseQueue.size()); - stats.responseQueueLatency.sample( - waiting_ticks * 1e9 / getClockFrequency()); - if (num_responses_sent >= maxRespPerCycle) { - if (!responseQueue.empty()) { - stats.responsePortShortage++; - } - break; - } - if (responseQueue.empty()) { - break; - } - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } -} - -void -CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) -{ - Addr aligned_addr = roundDown(addr, peerMemoryAtomSize); - int block_index = getBlockIndex(aligned_addr); - int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Received a write request for addr: %lu with " - "wl: %s. This request maps to cacheBlocks[%d], " - "aligned_addr: %lu, and wl_offset: %d.\n", - __func__, addr, graphWorkload->printWorkListItem(wl), - block_index, aligned_addr, wl_offset); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - DPRINTF(CoalesceEngine, "%s: Received a write for WorkListItem: %s " - "with Addr: %lu.\n", __func__, - graphWorkload->printWorkListItem(wl), addr); - // Desing does not allow for write misses for now. - assert(cacheBlocks[block_index].addr == aligned_addr); - // cache state asserts - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask != 0); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - // respective bit in busyMask for wl is set. 
- assert((cacheBlocks[block_index].busyMask & (1 << wl_offset)) == - (1 << wl_offset)); - - if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { - cacheBlocks[block_index].needsWB |= true; - stats.numVertexWrites++; - } - cacheBlocks[block_index].items[wl_offset] = wl; - if (graphWorkload->applyCondition(cacheBlocks[block_index].items[wl_offset])) { - cacheBlocks[block_index].needsApply |= true; - cacheBlocks[block_index].needsWB |= true; - } - - cacheBlocks[block_index].busyMask &= ~(1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CoalesceEngine, "%s: Wrote to cacheBlocks[%d][%d] = %s.\n", - __func__, block_index, wl_offset, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset])); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - - // TODO: Make this more general and programmable. - if ((cacheBlocks[block_index].busyMask == 0)) { - if (cacheBlocks[block_index].needsApply) { - cacheBlocks[block_index].pendingApply = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - applyQueue.push_back(block_index); - DPRINTF(CoalesceEngine, "%s: Added cacheBlocks[%d] to " - "applyQueue.\n", __func__, block_index); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - } else { - assert(MSHR.size() <= numMSHREntries); - // cache line has conflict. 
- if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflict.\n", __func__, block_index); - if (cacheBlocks[block_index].needsWB) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] needs a write" - " back.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] does not need" - " a write back.\n", __func__, block_index); - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead " - "for input %d to memoryFunctionQueue.\n", - __func__, block_index); - if ((!nextMemoryEvent.pending()) && - 
(!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - } - } - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - -} - -void -CoalesceEngine::processNextPreWBApplyEvent() -{ - int block_index = applyQueue.front(); - DPRINTF(CoalesceEngine, "%s: Looking at the front of the applyQueue. " - "cacheBlock[%d] to be applied.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingWB); - - if (cacheBlocks[block_index].pendingApply) { - assert(cacheBlocks[block_index].busyMask == 0); - for (int index = 0; index < numElementsPerLine; index++) { - bool do_push = graphWorkload->preWBApply(cacheBlocks[block_index].items[index]); - if (do_push) { - int bit_index_base = getBitIndexBase(cacheBlocks[block_index].addr); - if (needsPush[bit_index_base + index] == 0) { - needsPush[bit_index_base + index] = 1; - _workCount++; - activeBits.push_back(bit_index_base + index); - if (!owner->running()) { - owner->start(); - } - } - } - } - stats.bitvectorLength.sample(needsPush.count()); - - assert(cacheBlocks[block_index].needsWB); - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - - assert(MSHR.size() <= numMSHREntries); - if (MSHR.find(block_index) != MSHR.end()) { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has pending " - "conflicts.\n", __func__, block_index); - cacheBlocks[block_index].pendingWB = true; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - 
[this] (int block_index, Tick schedule_tick) { - processNextWriteBack(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextWriteBack for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is in " - "idle state now.\n", __func__, block_index); - } - DPRINTF(CacheBlockState, "%s: cacheBlock[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - stats.numInvalidApplies++; - } - - applyQueue.pop_front(); - if ((!applyQueue.empty()) && - (!nextPreWBApplyEvent.scheduled())) { - schedule(nextPreWBApplyEvent, nextCycle()); - } - - if (done()) { - owner->recvDoneSignal(); - } -} - -void -CoalesceEngine::processNextMemoryEvent() -{ - if (memPort.blocked()) { - stats.numMemoryBlocks++; - nextMemoryEvent.sleep(); - return; - } - - DPRINTF(CoalesceEngine, "%s: Processing another " - "memory function.\n", __func__); - std::function next_memory_function; - int next_memory_function_input; - Tick next_memory_function_tick; - std::tie( - next_memory_function, - next_memory_function_input, - next_memory_function_tick) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input, next_memory_function_tick); - memoryFunctionQueue.pop_front(); - stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) - * 1e9 / getClockFrequency()); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. 
" - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); - - assert(!nextMemoryEvent.pending()); - assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { - schedule(nextMemoryEvent, nextCycle()); - } -} - -void -CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be filled.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", - __func__, block_index, cacheBlocks[block_index].to_string()); - // A cache block should not be touched while it's waiting for data. - // assert(schedule_tick == cacheBlocks[block_index].lastChangedTick); - - if (cacheBlocks[block_index].lastChangedTick != schedule_tick) { - return; - } - - assert(!cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(!cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(!cacheBlocks[block_index].pendingWB); - - bool need_send_pkt = true; - for (auto wb = postPushWBQueue.begin(); wb != postPushWBQueue.end(); wb++) - { - PacketPtr wb_pkt = std::get<0>(*wb); - if (cacheBlocks[block_index].addr == wb_pkt->getAddr()) { - wb_pkt->writeDataToBlock( - (uint8_t*) cacheBlocks[block_index].items, peerMemoryAtomSize); - cacheBlocks[block_index].needsWB = true; - for (auto it = MSHR[block_index].begin(); it != MSHR[block_index].end();) { - Addr miss_addr = *it; - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - - if (aligned_miss_addr == cacheBlocks[block_index].addr) { - int wl_offset = (miss_addr - aligned_miss_addr) / sizeof(WorkListItem); - DPRINTF(CoalesceEngine, "%s: Addr: %lu in the MSHR for " - "cacheBlocks[%d] can be serviced with the received " - "packet.\n",__func__, miss_addr, block_index); - // TODO: Make this block of code into a function - 
responseQueue.push_back(std::make_tuple(miss_addr, - cacheBlocks[block_index].items[wl_offset], curTick())); - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " - "to responseQueue. responseQueue.size = %d.\n", - __func__, miss_addr, - graphWorkload->printWorkListItem( - cacheBlocks[block_index].items[wl_offset]), - responseQueue.size()); - // TODO: Add a stat to count the number of WLItems that have been touched. - cacheBlocks[block_index].busyMask |= (1 << wl_offset); - cacheBlocks[block_index].lastChangedTick = curTick(); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - it = MSHR[block_index].erase(it); - } else { - it++; - } - } - if (MSHR[block_index].empty()) { - MSHR.erase(block_index); - } - - if ((!nextResponseEvent.scheduled()) && - (!responseQueue.empty())) { - schedule(nextResponseEvent, nextCycle()); - } - postPushWBQueue.erase(wb); - need_send_pkt = false; - } - } - - if (pendingVertexPullReads.find(cacheBlocks[block_index].addr) != - pendingVertexPullReads.end()) { - need_send_pkt = false; - } - - if (need_send_pkt) { - PacketPtr pkt = createReadPacket(cacheBlocks[block_index].addr, - peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: Created a read packet. 
addr = %lu, " - "size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; - - if (pendingVertexPullReads.find(pkt->getAddr()) != - pendingVertexPullReads.end()) { - stats.numDoubleMemReads++; - } - } -} - -void -CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) -{ - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] to be written back.\n", - __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - if (schedule_tick == cacheBlocks[block_index].lastChangedTick) { - assert(cacheBlocks[block_index].valid); - assert(cacheBlocks[block_index].busyMask == 0); - assert(cacheBlocks[block_index].needsWB); - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - assert(!cacheBlocks[block_index].pendingApply); - assert(cacheBlocks[block_index].pendingWB); - - // Why would we write it back if it does not have a conflict. 
- assert(MSHR.size() <= numMSHREntries); - assert(MSHR.find(block_index) != MSHR.end()); - - PacketPtr pkt = createWritePacket( - cacheBlocks[block_index].addr, peerMemoryAtomSize, - (uint8_t*) cacheBlocks[block_index].items); - DPRINTF(CoalesceEngine, "%s: Created a write packet to " - "Addr: %lu, size = %d.\n", __func__, - pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - // onTheFlyReqs++; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].pendingWB = false; - - Addr miss_addr = MSHR[block_index].front(); - Addr aligned_miss_addr = - roundDown(miss_addr, peerMemoryAtomSize); - DPRINTF(CoalesceEngine, "%s: First conflicting address for" - " cacheBlocks[%d] is addr: %lu, aligned_addr: %lu.\n", - __func__, block_index, miss_addr, aligned_miss_addr); - - cacheBlocks[block_index].addr = aligned_miss_addr; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].needsWB = false; - cacheBlocks[block_index].needsApply = false; - cacheBlocks[block_index].pendingData = true; - cacheBlocks[block_index].pendingApply = false; - cacheBlocks[block_index].pendingWB = false; - cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextRead for input" - " %d to memoryFunctionQueue.\n", __func__, block_index); - DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, - block_index, cacheBlocks[block_index].to_string()); - } else { - DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has been touched since a " - "write back has been scheduled for it. 
Ignoring " - "the current write back scheduled at tick %lu for " - "the right function scheduled later.\n", - __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; - } -} - -void -CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) -{ - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - postPushWBQueue.pop_front(); - } -} - -std::tuple -CoalesceEngine::getOptimalPullAddr() -{ - int visited_bits = 0; - int num_intial_active_bits = activeBits.size(); - while (visited_bits < num_intial_active_bits) { - int index = activeBits.front(); - int base_index = roundDown(index, numElementsPerLine); - int index_offset = index - base_index; - assert(needsPush[index] == 1); - assert(index_offset < numElementsPerLine); - - Addr addr = getBlockAddrFromBitIndex(base_index); - int block_index = getBlockIndex(addr); - if (pendingVertexPullReads.find(addr) != pendingVertexPullReads.end()) - { - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::PENDING_READ, addr, index_offset); - } else { - // Only if it is in cache and it is in idle state. 
- if ((cacheBlocks[block_index].addr == addr) && - (cacheBlocks[block_index].valid) && - (cacheBlocks[block_index].busyMask == 0) && - (!cacheBlocks[block_index].pendingApply) && - (!cacheBlocks[block_index].pendingWB)) { - assert(!cacheBlocks[block_index].needsApply); - assert(!cacheBlocks[block_index].pendingData); - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_CACHE, block_index, index_offset); - // Otherwise if it is in memory - } else if ((cacheBlocks[block_index].addr != addr)) { - activeBits.pop_front(); - return std::make_tuple( - WorkLocation::IN_MEMORY, addr, index_offset); - } - } - activeBits.pop_front(); - activeBits.push_back(index); - visited_bits++; - } - - return std::make_tuple(WorkLocation::GARBAGE, 0, 0); -} - -void -CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) -{ - WorkLocation bit_status; - Addr location; - int offset; - - std::tie(bit_status, location, offset) = getOptimalPullAddr(); - - if (bit_status != WorkLocation::GARBAGE) { - if (bit_status == WorkLocation::PENDING_READ) { - // renaming the outputs to thier local names. - Addr addr = location; - int index_offset = offset; - - uint64_t send_mask = pendingVertexPullReads[addr]; - uint64_t vertex_send_mask = send_mask & (1 << index_offset); - assert(vertex_send_mask == 0); - send_mask |= (1 << index_offset); - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_CACHE) { - // renaming the outputs to their local names. 
- int block_index = (int) location; - int wl_offset = offset; - - Addr addr = cacheBlocks[block_index].addr; - Addr vertex_addr = addr + (wl_offset * sizeof(WorkListItem)); - int slice_base_index = getBitIndexBase(addr); - - needsPush[slice_base_index + wl_offset] = 0; - _workCount--; - - uint32_t delta; - bool do_push, do_wb; - std::tie(delta, do_push, do_wb) = graphWorkload->prePushApply( - cacheBlocks[block_index].items[wl_offset]); - cacheBlocks[block_index].needsWB |= do_wb; - if (do_push) { - owner->recvVertexPush(vertex_addr, delta, - cacheBlocks[block_index].items[wl_offset].edgeIndex, - cacheBlocks[block_index].items[wl_offset].degree); - } else { - DPRINTF(CoalesceEngine, "%s: Fuck!.\n", __func__); - owner->recvPrevPullCorrection(); - } - stats.verticesPushed++; - stats.lastVertexPushTime = curTick() - stats.lastResetTick; - numPullsReceived--; - } - if (bit_status == WorkLocation::IN_MEMORY) { - if (postPushWBQueue.size() < (postPushWBQueueSize - maxPotentialPostPushWB)) { - Addr addr = location; - int index_offset = offset; - uint64_t send_mask = (1 << index_offset); - assert(pendingVertexPullReads.find(addr) == pendingVertexPullReads.end()); - PacketPtr pkt = createReadPacket(addr, peerMemoryAtomSize); - SenderState* sender_state = new SenderState(true); - pkt->pushSenderState(sender_state); - memPort.sendPacket(pkt); - onTheFlyReqs++; - maxPotentialPostPushWB++; - pendingVertexPullReads[addr] = send_mask; - numPullsReceived--; - } - } - } - - stats.bitvectorSearchStatus[bit_status]++; - - if (numPullsReceived > 0) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - DPRINTF(CoalesceEngine, "%s: Pushed processNextVertexPull with input " - "0 to memoryFunctionQueue.\n", __func__); - } -} - -void -CoalesceEngine::recvMemRetry() -{ - DPRINTF(CoalesceEngine, "%s: Received a MemRetry.\n", __func__); - - if (!nextMemoryEvent.pending()) { - 
DPRINTF(CoalesceEngine, "%s: Not pending MemRerty.\n", __func__); - return; - } - assert(!nextMemoryEvent.scheduled()); - nextMemoryEvent.wake(); - schedule(nextMemoryEvent, nextCycle()); -} - -void -CoalesceEngine::recvVertexPull() -{ - bool should_schedule = (numPullsReceived == 0); - numPullsReceived++; - - stats.verticesPulled++; - stats.lastVertexPullTime = curTick() - stats.lastResetTick; - if (should_schedule) { - memoryFunctionQueue.emplace_back( - [this] (int slice_base, Tick schedule_tick) { - processNextVertexPull(slice_base, schedule_tick); - }, 0, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - } -} - -CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) - : statistics::Group(&_coalesce), - coalesce(_coalesce), - lastResetTick(0), - ADD_STAT(numVertexReads, statistics::units::Count::get(), - "Number of memory vertecies read from cache."), - ADD_STAT(numVertexWrites, statistics::units::Count::get(), - "Number of memory vertecies written to cache."), - ADD_STAT(readHits, statistics::units::Count::get(), - "Number of cache hits."), - ADD_STAT(readMisses, statistics::units::Count::get(), - "Number of cache misses."), - ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), - "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), - ADD_STAT(mshrTargetShortage, statistics::units::Count::get(), - "Number of cache rejections caused by target shortage."), - ADD_STAT(responsePortShortage, statistics::units::Count::get(), - "Number of times a response has been " - "delayed because of port shortage. "), - ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), - "Number of times memory bandwidth was not available."), - ADD_STAT(numDoubleMemReads, statistics::units::Count::get(), - "Number of times a memory block has been read twice. 
" - "Once for push and once to populate the cache."), - ADD_STAT(verticesPulled, statistics::units::Count::get(), - "Number of times a pull request has been sent by PushEngine."), - ADD_STAT(verticesPushed, statistics::units::Count::get(), - "Number of times a vertex has been pushed to the PushEngine"), - ADD_STAT(lastVertexPullTime, statistics::units::Tick::get(), - "Time of the last pull request. (Relative to reset_stats)"), - ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), - "Time of the last vertex push. (Relative to reset_stats)"), - ADD_STAT(numInvalidApplies, statistics::units::Count::get(), - "Number of times a line has become busy" - " while waiting to be applied."), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), - ADD_STAT(bitvectorSearchStatus, statistics::units::Count::get(), - "Distribution for the location of vertex searches."), - ADD_STAT(hitRate, statistics::units::Ratio::get(), - "Hit rate in the cache."), - ADD_STAT(vertexPullBW, statistics::units::Rate::get(), - "Rate at which pull requests arrive."), - ADD_STAT(vertexPushBW, statistics::units::Rate::get(), - "Rate at which vertices are pushed."), - ADD_STAT(mshrEntryLength, statistics::units::Count::get(), - "Histogram on the length of the mshr entries."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(responseQueueLatency, statistics::units::Second::get(), - "Histogram of the response latency to WLEngine. 
(ns)"), - ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), - "Histogram of the latency of processing a memory function.") -{ -} - -void -CoalesceEngine::CoalesceStats::regStats() -{ - using namespace statistics; - - bitvectorSearchStatus.init(NUM_STATUS); - bitvectorSearchStatus.subname(0, "PENDING_READ"); - bitvectorSearchStatus.subname(1, "IN_CACHE"); - bitvectorSearchStatus.subname(2, "IN_MEMORY"); - bitvectorSearchStatus.subname(3, "GARBAGE"); - - hitRate = (readHits + readHitUnderMisses) / - (readHits + readHitUnderMisses + readMisses); - - vertexPullBW = (verticesPulled * getClockFrequency()) / lastVertexPullTime; - - vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - - mshrEntryLength.init(coalesce.params().num_tgts_per_mshr); - bitvectorLength.init(64); - responseQueueLatency.init(64); - memoryFunctionLatency.init(64); -} - -void -CoalesceEngine::CoalesceStats::resetStats() -{ - statistics::Group::resetStats(); - - lastResetTick = curTick(); -} - -} // namespace gem5 diff --git a/src/accl/graph/sega/coalesce_engine_bak.hh b/src/accl/graph/sega/coalesce_engine_bak.hh deleted file mode 100644 index 0787a334c1..0000000000 --- a/src/accl/graph/sega/coalesce_engine_bak.hh +++ /dev/null @@ -1,218 +0,0 @@ -/* - * Copyright (c) 2020 The Regents of the University of California. - * All rights reserved. 
- * - * Redistribution and use in source and binary forms, with or without - * modification, are permitted provided that the following conditions are - * met: redistributions of source code must retain the above copyright - * notice, this list of conditions and the following disclaimer; - * redistributions in binary form must reproduce the above copyright - * notice, this list of conditions and the following disclaimer in the - * documentation and/or other materials provided with the distribution; - * neither the name of the copyright holders nor the names of its - * contributors may be used to endorse or promote products derived from - * this software without specific prior written permission. - * - * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS - * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT - * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR - * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT - * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, - * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT - * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, - * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY - * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT - * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE - * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- */ - -#ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ -#define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ - -#include - -#include "accl/graph/base/data_structs.hh" -#include "accl/graph/base/graph_workload.hh" -#include "accl/graph/sega/base_memory_engine.hh" -#include "base/cprintf.hh" -#include "base/statistics.hh" -#include "params/CoalesceEngine.hh" - - - -namespace gem5 -{ - -enum WorkLocation -{ - PENDING_READ, - IN_CACHE, - IN_MEMORY, - GARBAGE, - NUM_STATUS -}; - -class MPU; - -class CoalesceEngine : public BaseMemoryEngine -{ - private: - struct Block - { - WorkListItem* items; - Addr addr; - uint64_t busyMask; - bool valid; - bool needsApply; - bool needsWB; - bool pendingData; - bool pendingApply; - bool pendingWB; - Tick lastChangedTick; - // TODO: This might be useful in the future - // Tick lastWLWriteTick; - Block() {} - Block(int num_elements): - addr(-1), - busyMask(0), - valid(false), - needsApply(false), - needsWB(false), - pendingData(false), - pendingApply(false), - pendingWB(false), - lastChangedTick(0), - { - items = new WorkListItem [num_elements]; - } - - std::string to_string() { - return csprintf("CacheBlock{addr: %lu, busyMask: %lu, valid: %s, " - "needsApply: %s, needsWB: %s, pendingData: %s, " - "pendingApply: %s, pendingWB: %s, lastChangedTick: %lu}", - addr, busyMask, valid ? "true" : "false", - needsApply ? "true" : "false", needsWB ? "true" : "false", - pendingData ? "true" : "false", pendingApply ? "true" : "false", - pendingWB ? 
"true" : "false", lastChangedTick); - } - }; - - struct SenderState : public Packet::SenderState - { - bool isRetry; - SenderState(bool is_retry): isRetry(is_retry) {} - }; - MPU* owner; - GraphWorkload* graphWorkload; - - int numLines; - int numElementsPerLine; - Block* cacheBlocks; - - int onTheFlyReqs; - int numMSHREntries; - int numTgtsPerMSHR; - std::unordered_map> MSHR; - int maxRespPerCycle; - std::deque> responseQueue; - - int _workCount; - int numPullsReceived; - UniqueFIFO applyQueue; - std::bitset needsPush; - std::deque activeBits; - int postPushWBQueueSize; - std::deque> postPushWBQueue; - - int getBlockIndex(Addr addr); - int getBitIndexBase(Addr addr); - Addr getBlockAddrFromBitIndex(int index); - std::tuple getOptimalPullAddr(); - - int maxPotentialPostPushWB; - // A map from addr to sendMask. sendMask determines which bytes to - // send for push when getting the read response from memory. - std::unordered_map pendingVertexPullReads; - - MemoryEvent nextMemoryEvent; - void processNextMemoryEvent(); - void processNextRead(int block_index, Tick schedule_tick); - void processNextWriteBack(int block_index, Tick schedule_tick); - void processNextVertexPull(int ignore, Tick schedule_tick); - void processNextPostPushWB(int ignore, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; - - EventFunctionWrapper nextResponseEvent; - void processNextResponseEvent(); - - EventFunctionWrapper nextPreWBApplyEvent; - void processNextPreWBApplyEvent(); - - struct CoalesceStats : public statistics::Group - { - CoalesceStats(CoalesceEngine &coalesce); - - virtual void regStats() override; - - virtual void resetStats() override; - - CoalesceEngine &coalesce; - - Tick lastResetTick; - - statistics::Scalar numVertexReads; - statistics::Scalar numVertexWrites; - statistics::Scalar readHits; - statistics::Scalar readMisses; - statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; - statistics::Scalar mshrTargetShortage; - 
statistics::Scalar responsePortShortage; - statistics::Scalar numMemoryBlocks; - statistics::Scalar numDoubleMemReads; - statistics::Scalar verticesPulled; - statistics::Scalar verticesPushed; - statistics::Scalar lastVertexPullTime; - statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidApplies; - statistics::Scalar numInvalidWriteBacks; - - statistics::Vector bitvectorSearchStatus; - - statistics::Formula hitRate; - statistics::Formula vertexPullBW; - statistics::Formula vertexPushBW; - - statistics::Histogram mshrEntryLength; - statistics::Histogram bitvectorLength; - statistics::Histogram responseQueueLatency; - statistics::Histogram memoryFunctionLatency; - }; - - CoalesceStats stats; - - protected: - virtual void recvMemRetry() override; - virtual bool handleMemResp(PacketPtr pkt) override; - - public: - PARAMS(CoalesceEngine); - CoalesceEngine(const Params ¶ms); - void registerMPU(MPU* mpu); - - void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); - - bool recvWLRead(Addr addr); - void recvWLWrite(Addr addr, WorkListItem wl); - - int workCount() { return _workCount; } - void recvVertexPull(); - - bool done(); -}; - -} - -#endif // __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc new file mode 100644 index 0000000000..8c9d223178 --- /dev/null +++ b/src/accl/graph/sega/enums.cc @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#include "accl/graph/sega/enums.hh" + +namespace gem5 +{ + +const char* cacheStateStrings[NUM_CACHE_STATE] = { + "INVALID", + "PENDING_DATA", + "BUSY", + "IDLE", + "PENDING_WB", + "LOCKED_FOR_APPLY" +}; + + +const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = +{ + "ACCEPT", + "REJECT_ROLL", + "REJECT_NO_ROLL" +}; + +const char* readDestinationStrings[NUM_READ_DESTINATION] = +{ + "READ_FOR_CACHE", + "READ_FOR_PUSH" +}; + +} // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh new file mode 100644 index 0000000000..e7a8f84452 --- /dev/null +++ b/src/accl/graph/sega/enums.hh @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#ifndef __ACCL_GRAPH_SEGA_ENUMS_HH__ +#define __ACCL_GRAPH_SEGA_ENUMS_HH__ + +namespace gem5 +{ + +enum CacheState +{ + INVALID, + PENDING_DATA, + BUSY, + IDLE, + PENDING_WB, + LOCKED_FOR_APPLY, + NUM_CACHE_STATE +}; +extern const char* cacheStateStrings[NUM_CACHE_STATE]; + +enum ReadReturnStatus +{ + ACCEPT, + REJECT_ROLL, + REJECT_NO_ROLL, + NUM_READ_RETURN_STATUS +}; +extern const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS]; + +enum ReadDestination +{ + READ_FOR_CACHE, + READ_FOR_PUSH, + NUM_READ_DESTINATION +}; +extern const char* readDestinationStrings[NUM_READ_DESTINATION]; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index b30060238d..f661bd68a6 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -87,12 +87,6 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, pushEngine->recvVertexPush(addr, delta, edge_index, degree); } -void -MPU::recvPrevPullCorrection() -{ - pushEngine->recvPrevPullCorrection(); -} - void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 8f3b29f603..ad18a0d5a5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -34,6 +34,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/sega/coalesce_engine.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/push_engine.hh" #include 
"accl/graph/sega/wl_engine.hh" #include "base/addr_range.hh" @@ -64,10 +65,12 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } + void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + bool handleIncomingUpdate(PacketPtr pkt); void handleIncomingWL(Addr addr, WorkListItem wl); - bool recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } + ReadReturnStatus recvWLRead(Addr addr) { return coalesceEngine->recvWLRead(addr); } void recvWLWrite(Addr addr, WorkListItem wl); void recvWorkload(GraphWorkload* Workload); @@ -77,7 +80,6 @@ class MPU : public SimObject void start() { return pushEngine->start(); } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvDoneSignal(); bool done(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 07f37a28dc..a17991e335 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -155,13 +155,13 @@ void PushEngine::start() { assert(!_running); - assert(!nextVertexPullEvent.scheduled()); + // assert(!nextVertexPullEvent.scheduled()); _running = true; stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. 
assert(workLeft()); - if (vertexSpace()) { + if (vertexSpace() && !nextVertexPullEvent.scheduled()) { schedule(nextVertexPullEvent, nextCycle()); } } @@ -169,17 +169,16 @@ PushEngine::start() void PushEngine::processNextVertexPullEvent() { - // TODO: change edgePointerQueueSize - numPendingPulls++; - owner->recvVertexPull(); - - if (!workLeft()) { + if (workLeft()) { + numPendingPulls++; + owner->recvVertexPull(); + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { + schedule(nextVertexPullEvent, nextCycle()); + } + } else { _running = false; lastIdleEntranceTick = curTick(); - } - - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); + DPRINTF(PushEngine, "%s: In idle state now.\n", __func__); } } @@ -197,9 +196,9 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } @@ -209,16 +208,6 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, } } -void -PushEngine::recvPrevPullCorrection() -{ - assert(numPendingPulls > 0); - numPendingPulls--; - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { - schedule(nextVertexPullEvent, nextCycle()); - } -} - void PushEngine::processNextMemoryReadEvent() { @@ -255,7 +244,7 @@ PushEngine::processNextMemoryReadEvent() } } - if (workLeft() && vertexSpace() && (!nextVertexPullEvent.scheduled())) { + if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { schedule(nextVertexPullEvent, nextCycle()); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2e1de25390..08cceb14f0 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -32,6 +32,7 @@ #include 
"accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" +#include "accl/graph/sega/enums.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -199,7 +200,6 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); - void recvPrevPullCorrection(); void recvReqRetry(); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index a698f2cc0a..2b305e1557 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -134,7 +134,7 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::checkRetryReq() { - for (int i = 0; i < inPorts.size(); ++i) { + for (int i = 0; i < inPorts.size(); i++) { inPorts[i].checkRetryReq(); } } @@ -191,12 +191,8 @@ WLEngine::processNextReadEvent() if (registerFile.size() < registerFileSize) { DPRINTF(WLEngine, "%s: There are free registers available in the " "registerFile.\n", __func__); - // TODO: It might be a good idea for WLEngine to act differently - // on cache rejects. As a first step the cache should not just - // return a boolean value. It should return an integer/enum - // to tell WLEngine why it rejected the read request. Their might - // be things that WLEngine can do to fix head of the line blocking. 
- if (owner->recvWLRead(update_addr)) { + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " "request to addr: %lu.\n", __func__, update_addr); registerFile[update_addr] = update_value; @@ -209,7 +205,8 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, @@ -220,6 +217,17 @@ WLEngine::processNextReadEvent() update_value, updateQueue.size(), updateQueueSize); checkRetryReq(); vertexReadTime[update_addr] = curTick(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } } } else { DPRINTF(WLEngine, "%s: There are no free registers " @@ -227,7 +235,6 @@ WLEngine::processNextReadEvent() stats.registerShortage++; } } else { - // TODO: Generalize this to reduce function rather than just min DPRINTF(WLEngine, "%s: A register has already been allocated for " "addr: %lu in registerFile. 
registerFile[%lu] = %u.\n", __func__, update_addr, update_addr, registerFile[update_addr]); @@ -238,7 +245,8 @@ WLEngine::processNextReadEvent() update_value, update_addr, registerFile[update_addr]); stats.registerFileCoalesce++; updateQueue.pop_front(); - stats.updateQueueLatency.sample((curTick() - enter_tick) * 1e9 / getClockFrequency()); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index f442d6060e..b5ad3d9040 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -35,6 +35,7 @@ #include "accl/graph/base/base_reduce_engine.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/base/data_structs.hh" +#include "accl/graph/sega/enums.hh" #include "base/statistics.hh" #include "params/WLEngine.hh" diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh new file mode 100644 index 0000000000..4102e29cd3 --- /dev/null +++ b/src/accl/graph/sega/work_directory.hh @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2020 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ + +#include "base/addr_range.hh" +#include "base/types.hh" + +namespace gem5 +{ + +class WorkDirectory +{ + public: + virtual void activate(Addr atom_addr) = 0; + virtual void deactivate(Addr atom_addr) = 0; + virtual Addr getNextWork() = 0; + + virtual int workCount() = 0; + bool empty() { return workCount() == 0; } + + virtual void setLastAtomAddr(Addr atom_addr) = 0; +}; + +class PopCountDirectory: public WorkDirectory +{ + private: + AddrRange memoryRange; + + int numAtomsPerBlock; + int memoryAtomSize; + int blockSize; + + uint32_t _workCount; + + int numCounters; + int lastCounterIndex; + uint32_t* popCount; + + uint32_t currentIndex; + uint32_t currentCounter; + + int getIndexFromAtomAddr(Addr atom_addr) + { + assert((atom_addr % memoryAtomSize) == 0); + Addr trimmed_addr = memoryRange.removeIntlvBits(atom_addr); + int index = (int) (trimmed_addr / blockSize); + return index; + } + + Addr getAtomAddrFromIndex(int block_index, int atom_index) + { + Addr block_addr = block_index * blockSize; + Addr trimmed_addr = block_addr + atom_index * memoryAtomSize; + return memoryRange.addIntlvBits(trimmed_addr); + } + + public: + PopCountDirectory(AddrRange mem_range, int atoms_per_block, int atom_size): + WorkDirectory(), + memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), + memoryAtomSize(atom_size), _workCount(0), + currentIndex(0), currentCounter(0) + { + blockSize = numAtomsPerBlock * memoryAtomSize; + int numCounters = (int) (memoryRange.size() / blockSize); + lastCounterIndex = numCounters - 1; + popCount = new uint32_t [numCounters]; + for (int index = 0; index < numCounters; index++) { + popCount[index] = 0; + } + } + + // CAUTION: This should only be called when the work + // directory **is not** tracking the the atom with atom_addr + virtual void activate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = 
popCount[index]; + popCount[index]++; + _workCount++; + assert(popCount[index] > prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + // CAUTION: This should only be called when the work + // directory **is** tracking the the atom with atom_addr + virtual void deactivate(Addr atom_addr) + { + int index = getIndexFromAtomAddr(atom_addr); + uint32_t prev_count = popCount[index]; + popCount[index]--; + _workCount--; + assert(popCount[index] < prev_count); + assert(popCount[index] <= numAtomsPerBlock); + } + + virtual int workCount() { return _workCount; } + + void setLastAtomAddr(Addr atom_addr) + { + lastCounterIndex = getIndexFromAtomAddr(atom_addr); + } + + // CAUTION: If this function returns an addr that + // is in the cache, that addr should be ignored. + // CAUTION: The receiver should track the last n + // addresses that this WorkDirectory has generated. + // where n is equal to the size of the entry holding + // reads generated by this WorkDirectory. In case + // the WorkDirectory generates a repeated address + // it should be ignored. + // FIXME: This should return garbage if it can't find anything. + // virtual Addr getNextWork() + // { + // if ((currentCounter == numAtomsPerBlock) || + // (popCount[currentIndex] == 0)) { + // int prev_index = currentIndex; + // while (true) { + // currentIndex++; + // // NOTE: this is an optimization. + // // lastCounterIndex tracks the last blockOfAtom that + // // has vertices. By default it is set to numCounters - 1. + // // However, it might not be necessary to track all the + // // numCounters counters. e.g. If this WorkDirectory is tracking + // // a 512 MiB memory with atom size of 32 B and 256 atoms + // // per block. Then it needs 64 Ki counters of 8 bit wide. + // // However, if we need 8 Mi atoms to store all our vertices, + // // the second half of the counters would not be used at all + // // (512 MiB hold 16 Mi atoms and we're only using half). 
+ // if (currentIndex > lastCounterIndex) { + // currentIndex = 0; + // } + // if (prev_index == currentIndex) { + // // NOTE: If we have reached the same index as before, + // // we need to decrement the currentCounter to generate + // // a repeatative address. This way the receiver can detect + // // the uselessness of the generated address and ignore it + // currentCounter--; + // break; + // } + // if (popCount[currentIndex] > 0) { + // currentCounter = 0; + // break; + // } + // } + // } + // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + // currentCounter++; + + // return ret_addr; + // } + + virtual Addr getNextWork() + { + if ((currentCounter == numAtomsPerBlock) || + (popCount[currentIndex] == 0)) { + int other_count = _workCount - popCount[currentIndex]; + if (other_count == 0) { + currentCounter = 0; + } else { + int prev_index = currentIndex; + while (true) { + currentIndex++; + if (currentIndex > lastCounterIndex) { + currentIndex = 0; + } + if (currentIndex == prev_index) { + break; + } + if (popCount[currentIndex] > 0) { + break; + } + } + currentCounter = 0; + } + } + Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + currentCounter++; + return ret_addr; + } +}; + +} // namespace gem5 + +#endif // __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index c65d68a5a7..3cbacef800 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -212,7 +212,7 @@ MemCtrl::addToReadQueue(PacketPtr pkt, for (int cnt = 0; cnt < pkt_count; ++cnt) { unsigned size = std::min((addr | (burst_size - 1)) + 1, base_addr + pkt->getSize()) - addr; - stats.readPktSize[ceilLog2(size)]++; + // stats.readPktSize[ceilLog2(size)]++; stats.readBursts++; stats.requestorReadAccesses[pkt->requestorId()]++; From c4fc96e2146aeec5e7a978c11dfd4e5b36a7a67b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 7 Nov 2022 19:53:35 -0800 Subject: [PATCH 212/287] Adding new stats. 
--- configs/accl/sega.py | 12 ++++-- src/accl/graph/sega/CoalesceEngine.py | 2 - src/accl/graph/sega/coalesce_engine.cc | 51 ++++++++++++-------------- src/accl/graph/sega/coalesce_engine.hh | 4 +- src/accl/graph/sega/push_engine.cc | 16 ++++++-- src/accl/graph/sega/push_engine.hh | 5 ++- 6 files changed, 51 insertions(+), 39 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 54f22b1377..7baa27fd5e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -53,7 +53,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, - num_mshr_entry=64, max_resp_per_cycle=8, active_buffer_size = 64, post_push_wb_queue_size=64, @@ -61,7 +60,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=512, + resp_queue_size=4096, update_queue_size=32, ) @@ -74,7 +73,11 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - + # self.edge_mem_ctrl = SimpleMemory(latency="90ns", + # latency_var="0ns", + # bandwidth="18GiB/s", + # range=AddrRange(edge_memory_size), + # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -105,6 +108,9 @@ def set_vertex_pch_bit(self, pch_bit): def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image + # def set_edge_image(self, edge_image): + # self.edge_mem_ctrl.image_file = edge_image + class SEGA(System): diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index a447dedc3d..76e7d262e8 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -37,8 +37,6 @@ class CoalesceEngine(BaseMemoryEngine): cache_size = Param.MemorySize("Size of the internal SRAM array.") - 
num_mshr_entry = Param.Int("Number of MSHR entries.") - max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") active_buffer_size = Param.Int("Maximum number of memory active memory " diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 0aa61345f7..d7cf173097 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,7 +46,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), numMSHREntries(params.num_mshr_entry), + onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), pullsReceived(0), pullsScheduled(0), pendingPullReads(0), activeBufferSize(params.active_buffer_size), @@ -227,7 +227,6 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].busyMask == 0); assert(!cacheBlocks[block_index].dirty); - assert(MSHR.size() <= numMSHREntries); assert(MSHR.find(block_index) != MSHR.end()); MSHR[block_index].push_back(addr); DPRINTF(CoalesceEngine, "%s: Added Addr: %lu to MSHR " @@ -239,7 +238,6 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // miss assert(cacheBlocks[block_index].addr != aligned_addr); - assert(MSHR.size() <= numMSHREntries); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); if (cacheBlocks[block_index].state != CacheState::INVALID) { @@ -284,29 +282,26 @@ CoalesceEngine::recvWLRead(Addr addr) } else { // cold miss assert(MSHR.find(block_index) == MSHR.end()); - if (MSHR.size() < numMSHREntries) { - cacheBlocks[block_index].addr = aligned_addr; - cacheBlocks[block_index].busyMask = 0; - cacheBlocks[block_index].valid = false; - cacheBlocks[block_index].dirty = false; - cacheBlocks[block_index].hasConflict = false; - cacheBlocks[block_index].state = 
CacheState::PENDING_DATA; - cacheBlocks[block_index].lastChangedTick = curTick(); + cacheBlocks[block_index].addr = aligned_addr; + cacheBlocks[block_index].busyMask = 0; + cacheBlocks[block_index].valid = false; + cacheBlocks[block_index].dirty = false; + cacheBlocks[block_index].hasConflict = false; + cacheBlocks[block_index].state = CacheState::PENDING_DATA; + cacheBlocks[block_index].lastChangedTick = curTick(); - MSHR[block_index].push_back(addr); - memoryFunctionQueue.emplace_back( - [this] (int block_index, Tick schedule_tick) { - processNextRead(block_index, schedule_tick); - }, block_index, curTick()); - if ((!nextMemoryEvent.pending()) && - (!nextMemoryEvent.scheduled())) { - schedule(nextMemoryEvent, nextCycle()); - } - return ReadReturnStatus::ACCEPT; - } else { - return ReadReturnStatus::REJECT_ROLL; + MSHR[block_index].push_back(addr); + memoryFunctionQueue.emplace_back( + [this] (int block_index, Tick schedule_tick) { + processNextRead(block_index, schedule_tick); + }, block_index, curTick()); + if ((!nextMemoryEvent.pending()) && + (!nextMemoryEvent.scheduled())) { + schedule(nextMemoryEvent, nextCycle()); } + return ReadReturnStatus::ACCEPT; } + stats.readMisses++; } } @@ -939,6 +934,8 @@ CoalesceEngine::processNextApplyEvent() owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } pkt->deleteData(); @@ -986,6 +983,8 @@ CoalesceEngine::processNextApplyEvent() cacheBlocks[block_index].items[index].edgeIndex, cacheBlocks[block_index].items[index].degree); pullsReceived--; + stats.verticesPushed++; + stats.lastVertexPushTime = curTick() - stats.lastResetTick; } } @@ -1057,8 +1056,6 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), - ADD_STAT(mshrEntryShortage, 
statistics::units::Count::get(), - "Number of cache rejections caused by entry shortage."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. "), @@ -1082,7 +1079,7 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) ADD_STAT(vertexPushBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(bitvectorLength, statistics::units::Count::get(), + ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. (ns)"), @@ -1103,7 +1100,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - bitvectorLength.init(64); + frontierSize.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c457b214f9..f87e0027a2 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -106,7 +106,6 @@ class CoalesceEngine : public BaseMemoryEngine Block* cacheBlocks; int onTheFlyReqs; - int numMSHREntries; std::unordered_map> MSHR; // Response route to WLEngine @@ -167,7 +166,6 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; - statistics::Scalar mshrEntryShortage; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; @@ -180,7 +178,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram bitvectorLength; + statistics::Histogram frontierSize; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff 
--git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a17991e335..09f29a43e4 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -158,7 +158,7 @@ PushEngine::start() // assert(!nextVertexPullEvent.scheduled()); _running = true; - stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); + // stats.numIdleCycles += ticksToCycles(curTick() - lastIdleEntranceTick); // NOTE: We might have to check for size availability here. assert(workLeft()); if (vertexSpace() && !nextVertexPullEvent.scheduled()) { @@ -196,6 +196,7 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, sizeof(Edge), peerMemoryAtomSize); edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); numPendingPulls--; if (vertexSpace() && (!nextVertexPullEvent.scheduled())) { @@ -239,6 +240,7 @@ PushEngine::processNextMemoryReadEvent() stats.edgePointerQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); edgePointerQueue.pop_front(); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); DPRINTF(PushEngine, "%s: Popped curr_info from edgePointerQueue. 
" "edgePointerQueue.size() = %u.\n", __func__, edgePointerQueue.size()); } @@ -282,6 +284,7 @@ PushEngine::handleMemResp(PacketPtr pkt) MetaEdge meta_edge( push_info.src, edge_dst, edge_weight, push_info.value); metaEdgeQueue.emplace_back(meta_edge, curTick()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } stats.numWastefulEdgesRead += (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; @@ -320,6 +323,7 @@ PushEngine::processNextPropagateEvent() stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); + stats.edgeQueueLength.sample(metaEdgeQueue.size()); } else { metaEdgeQueue.emplace_back(meta_edge, entrance_tick); } @@ -466,8 +470,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Number of propagate operations done."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), - ADD_STAT(numIdleCycles, statistics::units::Count::get(), - "Number of cycles PushEngine has been idle."), + // ADD_STAT(numIdleCycles, statistics::units::Count::get(), + // "Number of cycles PushEngine has been idle."), ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), "Number of coalescions in the update queues."), ADD_STAT(numUpdates, statistics::units::Count::get(), @@ -479,8 +483,12 @@ PushEngine::PushStats::PushStats(PushEngine &_push) "Traversed Edges Per Second."), ADD_STAT(edgePointerQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the edgePointerQueue."), + ADD_STAT(edgePointerQueueLength, statistics::units::Count::get(), + "Histogram of the size of the edgePointerQueue."), ADD_STAT(edgeQueueLatency, statistics::units::Second::get(), "Histogram of the latency of the metaEdgeQueue."), + ADD_STAT(edgeQueueLength, statistics::units::Count::get(), + "Histogram of the size of the metaEdgeQueue."), ADD_STAT(updateQueueLength, statistics::units::Count::get(), "Histogram of the length of updateQueues."), 
ADD_STAT(numPropagatesHist, statistics::units::Count::get(), @@ -496,7 +504,9 @@ PushEngine::PushStats::regStats() TEPS = numPropagates / simSeconds; edgePointerQueueLatency.init(64); + edgePointerQueueLength.init(64); edgeQueueLatency.init(64); + edgeQueueLength.init(64); updateQueueLength.init(64); numPropagatesHist.init(push.params().max_propagates_per_cycle); } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 08cceb14f0..f51865acb3 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -164,9 +164,10 @@ class PushEngine : public BaseMemoryEngine PushEngine &push; + statistics::Scalar numMemoryBlocks; statistics::Scalar numPropagates; statistics::Scalar numNetBlocks; - statistics::Scalar numIdleCycles; + // statistics::Scalar numIdleCycles; statistics::Scalar updateQueueCoalescions; statistics::Scalar numUpdates; statistics::Scalar numWastefulEdgesRead; @@ -174,7 +175,9 @@ class PushEngine : public BaseMemoryEngine statistics::Formula TEPS; statistics::Histogram edgePointerQueueLatency; + statistics::Histogram edgePointerQueueLength; statistics::Histogram edgeQueueLatency; + statistics::Histogram edgeQueueLength; statistics::Histogram updateQueueLength; statistics::Histogram numPropagatesHist; }; From b68602b864a995b5d5a248fb5364f973fc2ace3b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 07:36:05 -0800 Subject: [PATCH 213/287] Adding state. 
--- configs/accl/bfs.py | 35 ++++++++++++++++++++------ configs/accl/sega.py | 6 +---- src/accl/graph/sega/PushEngine.py | 4 +-- src/accl/graph/sega/coalesce_engine.cc | 26 +++++++++++++------ src/accl/graph/sega/coalesce_engine.hh | 1 + src/accl/graph/sega/work_directory.hh | 10 +++++--- 6 files changed, 57 insertions(+), 25 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index a201acd4d1..80331e3aad 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -47,6 +47,14 @@ def get_inputs(): default=False, help="Print final answer", ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample statistics", + ) args = argparser.parse_args() @@ -56,24 +64,37 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.sample, args.verify, ) if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, verify = get_inputs() + num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() - system.create_pop_count_directory(256) + system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(10000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7baa27fd5e..29a017ba65 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -61,6 
+61,7 @@ def __init__(self, edge_memory_size: str, cache_size: str): push_req_queue_size=32, attached_memory_atom_size=64, resp_queue_size=4096, + max_propagates_per_cycle=8, update_queue_size=32, ) @@ -73,11 +74,6 @@ def __init__(self, edge_memory_size: str, cache_size: str): range=AddrRange(edge_memory_size), in_addr_map=False ) ) - # self.edge_mem_ctrl = SimpleMemory(latency="90ns", - # latency_var="0ns", - # bandwidth="18GiB/s", - # range=AddrRange(edge_memory_size), - # in_addr_map=False) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port diff --git a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 20c5452d43..63fa1eae62 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -42,8 +42,8 @@ class PushEngine(BaseMemoryEngine): "push engine where it stores the " "edges read from memory.") - max_propagates_per_cycle = Param.Int(4, "Maximum number of propagates " - "done per cycle.") + max_propagates_per_cycle = Param.Int("Maximum number of propagates " + "done per cycle.") update_queue_size = Param.Int("Maximum number of entries " "for each update queue.") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index d7cf173097..adb33064f7 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -239,7 +239,7 @@ CoalesceEngine::recvWLRead(Addr addr) // miss assert(cacheBlocks[block_index].addr != aligned_addr); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); - + stats.readMisses++; if (cacheBlocks[block_index].state != CacheState::INVALID) { // conflict miss DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " @@ -268,7 +268,9 @@ CoalesceEngine::recvWLRead(Addr addr) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = 
directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } // NOTE: Bring the cache line to invalid state. // NOTE: Above line where we set hasConflict to true @@ -301,7 +303,6 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::ACCEPT; } - stats.readMisses++; } } @@ -376,8 +377,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) cacheBlocks[block_index].items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeCacheBlocks.push_back(block_index); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } assert(MSHR.find(block_index) != MSHR.end()); @@ -433,8 +436,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active |= graphWorkload->activeCondition(items[index]); } if (atom_active) { - directory->deactivate(addr); + int count = directory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. 
" "activeBuffer.size: %d.\n", __func__, pkt->print(), activeBuffer.size()); @@ -591,7 +596,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } cacheBlocks[block_index].reset(); } @@ -804,7 +811,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) } if (atom_active) { activeCacheBlocks.erase(block_index); - directory->activate(cacheBlocks[block_index].addr); + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); } PacketPtr pkt = createWritePacket( @@ -1081,6 +1090,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Rate at which vertices are pushed."), ADD_STAT(frontierSize, statistics::units::Count::get(), "Histogram of the length of the bitvector."), + ADD_STAT(blockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1101,6 +1112,7 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; frontierSize.init(64); + blockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f87e0027a2..b855fda38b 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -179,6 +179,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Formula vertexPushBW; statistics::Histogram frontierSize; + statistics::Histogram blockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 4102e29cd3..35778686c8 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -38,8 +38,8 @@ namespace gem5 class WorkDirectory { public: - virtual void activate(Addr atom_addr) = 0; - virtual void deactivate(Addr atom_addr) = 0; + virtual int activate(Addr atom_addr) = 0; + virtual int deactivate(Addr atom_addr) = 0; virtual Addr getNextWork() = 0; virtual int workCount() = 0; @@ -99,7 +99,7 @@ class PopCountDirectory: public WorkDirectory // CAUTION: This should only be called when the work // directory **is not** tracking the the atom with atom_addr - virtual void activate(Addr atom_addr) + virtual int activate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -107,11 +107,12 @@ class PopCountDirectory: public WorkDirectory _workCount++; assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } // CAUTION: This should only be called when the work // directory **is** tracking the the atom with atom_addr - virtual 
void deactivate(Addr atom_addr) + virtual int deactivate(Addr atom_addr) { int index = getIndexFromAtomAddr(atom_addr); uint32_t prev_count = popCount[index]; @@ -119,6 +120,7 @@ class PopCountDirectory: public WorkDirectory _workCount--; assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); + return popCount[index]; } virtual int workCount() { return _workCount; } From ec5025f2b3b1143ed9c1663e47464d937705ded3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:00:00 -0800 Subject: [PATCH 214/287] Adding stat to count number of conflict misses. --- src/accl/graph/sega/coalesce_engine.cc | 3 +++ src/accl/graph/sega/coalesce_engine.hh | 1 + 2 files changed, 4 insertions(+) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index adb33064f7..8c636615cd 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -279,6 +279,7 @@ CoalesceEngine::recvWLRead(Addr addr) } return ReadReturnStatus::REJECT_NO_ROLL; } else { + stats.numConflicts++; return ReadReturnStatus::REJECT_ROLL; } } else { @@ -1065,6 +1066,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Number of cache misses."), ADD_STAT(readHitUnderMisses, statistics::units::Count::get(), "Number of cache hit under misses."), + ADD_STAT(numConflicts, statistics::units::Count::get(), + "Number of conflicts raised by reads in the cache."), ADD_STAT(responsePortShortage, statistics::units::Count::get(), "Number of times a response has been " "delayed because of port shortage. 
"), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b855fda38b..c2da6a90cd 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -166,6 +166,7 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar readHits; statistics::Scalar readMisses; statistics::Scalar readHitUnderMisses; + statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; statistics::Scalar verticesPulled; From ca971137593af82054c428ea6d8bca8e949463d0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 15:17:20 -0800 Subject: [PATCH 215/287] Adding stat to count the number of update rolls. --- src/accl/graph/sega/coalesce_engine.cc | 3 --- src/accl/graph/sega/enums.cc | 3 +-- src/accl/graph/sega/enums.hh | 1 - src/accl/graph/sega/wl_engine.cc | 4 ++++ src/accl/graph/sega/wl_engine.hh | 1 + 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8c636615cd..b9ac25c502 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -179,9 +179,6 @@ CoalesceEngine::recvWLRead(Addr addr) if ((cacheBlocks[block_index].addr == aligned_addr) && (cacheBlocks[block_index].valid)) { // Hit - if (cacheBlocks[block_index].state == CacheState::LOCKED_FOR_APPLY) { - return ReadReturnStatus::REJECT_NO_ROLL; - } DPRINTF(CoalesceEngine, "%s: Addr: %lu is a hit.\n", __func__, addr); stats.readHits++; assert(cacheBlocks[block_index].state != CacheState::INVALID); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 8c9d223178..de5d569c18 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -36,8 +36,7 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_DATA", "BUSY", "IDLE", - "PENDING_WB", - "LOCKED_FOR_APPLY" + "PENDING_WB" }; diff --git 
a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index e7a8f84452..6153386b71 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -39,7 +39,6 @@ enum CacheState BUSY, IDLE, PENDING_WB, - LOCKED_FOR_APPLY, NUM_CACHE_STATE }; extern const char* cacheStateStrings[NUM_CACHE_STATE]; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 2b305e1557..ed91622b43 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -224,6 +224,7 @@ WLEngine::processNextReadEvent() update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); + stats.numUpdateRolls++; } else { DPRINTF(WLEngine, "%s: Received a reject from cache. " "Not rolling the update.\n", __func__); @@ -330,6 +331,9 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " "stalled because of register shortage"), + ADD_STAT(numUpdateRolls, statistics::units::Count::get(), + "Number of times an update has been rolled back " + "to the back of the update queue due to cache reject."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), ADD_STAT(updateQueueLatency, statistics::units::Second::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index b5ad3d9040..45baaa1e79 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -101,6 +101,7 @@ class WLEngine : public BaseReduceEngine statistics::Scalar numReduce; statistics::Scalar registerFileCoalesce; statistics::Scalar registerShortage; + statistics::Scalar numUpdateRolls; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From fd1561f7435537165a458e3aac7afded87904475 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 19:47:35 -0800 
Subject: [PATCH 216/287] Removing unnecessary comments. --- src/accl/graph/sega/coalesce_engine.cc | 52 +++----------------------- 1 file changed, 5 insertions(+), 47 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index b9ac25c502..98229dde24 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -86,6 +86,9 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) int block_index = getBlockIndex(addr); // FIXME: Check postPushWBQueue for hits + // Is it really the case though. I don't think at this time + // beacuse we check done after handleMemResp and make sure all + // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -438,23 +441,10 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) activeBuffer.emplace_back(pkt, curTick()); stats.blockActiveCount.sample(count); stats.frontierSize.sample(directory->workCount()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); } else { delete pkt; } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } + if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -685,9 +675,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; wb = postPushWBQueue.erase(wb); delete wb_pkt; - DPRINTF(MSDebug, "%s: Found addr: %lu in postPushWBQueue. 
" - "postPushWBQueue.size: %d.\n", __func__, - cacheBlocks[block_index].addr, postPushWBQueue.size()); } else { wb++; } @@ -707,16 +694,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) need_send_pkt = false; ab = activeBuffer.erase(ab); delete ab_pkt; - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // pullsScheduled++; - // } - DPRINTF(MSDebug, "%s: Found addr: %lu in activeBuffer. " - "activeBuffer.size: %d.\n", __func__, - cacheBlocks[block_index].addr, activeBuffer.size()); if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { @@ -841,6 +818,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) if (postPushWBQueue.empty()) { return; } + PacketPtr wb_pkt; Tick pkt_tick; std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); @@ -848,9 +826,6 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) memPort.sendPacket(wb_pkt); onTheFlyReqs++; postPushWBQueue.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); } } @@ -958,13 +933,7 @@ CoalesceEngine::processNextApplyEvent() PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); - DPRINTF(MSDebug, "%s: Empalced pkt: %s in postPushWBQueue. " - "postPushWBQueue.size: %d.\n", __func__, - wb_pkt->print(), postPushWBQueue.size()); activeBuffer.pop_front(); - DPRINTF(MSDebug, "%s: Popped pkt: %s from activeBuffer. 
" - "activeBuffer.size: %d.\n", __func__, - pkt->print(), activeBuffer.size()); memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextPostPushWB(ignore, schedule_tick); @@ -1020,17 +989,6 @@ CoalesceEngine::processNextApplyEvent() "work to apply.\n", __func__); } - // if (workLeftInMem() && timeToPull() && canSchedulePull()) { - // memoryFunctionQueue.emplace_back( - // [this] (int ignore, Tick schedule_tick) { - // processNextVertexPull(ignore, schedule_tick); - // }, 0, curTick()); - // if ((!nextMemoryEvent.pending()) && - // (!nextMemoryEvent.scheduled())) { - // schedule(nextMemoryEvent, nextCycle()); - // } - // pullsScheduled++; - // } if (pullCondition()) { memoryFunctionQueue.emplace_back( [this] (int ignore, Tick schedule_tick) { From 1124f5be5c9272df474387555d95f4e0603486c1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 8 Nov 2022 22:17:20 -0800 Subject: [PATCH 217/287] Removing comments. --- src/accl/graph/sega/work_directory.hh | 103 ++++++++------------------ 1 file changed, 30 insertions(+), 73 deletions(-) diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 35778686c8..18430aee0d 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -29,6 +29,9 @@ #ifndef __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ #define __ACCL_GRAPH_SEGA_WORK_DIRECTORY_HH__ +#include + +#include "accl/graph/base/data_structs.hh" #include "base/addr_range.hh" #include "base/types.hh" @@ -63,9 +66,11 @@ class PopCountDirectory: public WorkDirectory int lastCounterIndex; uint32_t* popCount; - uint32_t currentIndex; + uint32_t prevIndex; uint32_t currentCounter; + UniqueFIFO activeBlockIndices; + int getIndexFromAtomAddr(Addr atom_addr) { assert((atom_addr % memoryAtomSize) == 0); @@ -86,7 +91,7 @@ class PopCountDirectory: public WorkDirectory WorkDirectory(), memoryRange(mem_range), numAtomsPerBlock(atoms_per_block), memoryAtomSize(atom_size), _workCount(0), - 
currentIndex(0), currentCounter(0) + prevIndex(-1), currentCounter(0) { blockSize = numAtomsPerBlock * memoryAtomSize; int numCounters = (int) (memoryRange.size() / blockSize); @@ -105,6 +110,7 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]++; _workCount++; + activeBlockIndices.push_back(index); assert(popCount[index] > prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -118,6 +124,9 @@ class PopCountDirectory: public WorkDirectory uint32_t prev_count = popCount[index]; popCount[index]--; _workCount--; + if (popCount[index] == 0) { + activeBlockIndices.erase(index); + } assert(popCount[index] < prev_count); assert(popCount[index] <= numAtomsPerBlock); return popCount[index]; @@ -130,80 +139,28 @@ class PopCountDirectory: public WorkDirectory lastCounterIndex = getIndexFromAtomAddr(atom_addr); } - // CAUTION: If this function returns an addr that - // is in the cache, that addr should be ignored. - // CAUTION: The receiver should track the last n - // addresses that this WorkDirectory has generated. - // where n is equal to the size of the entry holding - // reads generated by this WorkDirectory. In case - // the WorkDirectory generates a repeated address - // it should be ignored. - // FIXME: This should return garbage if it can't find anything. - // virtual Addr getNextWork() - // { - // if ((currentCounter == numAtomsPerBlock) || - // (popCount[currentIndex] == 0)) { - // int prev_index = currentIndex; - // while (true) { - // currentIndex++; - // // NOTE: this is an optimization. - // // lastCounterIndex tracks the last blockOfAtom that - // // has vertices. By default it is set to numCounters - 1. - // // However, it might not be necessary to track all the - // // numCounters counters. e.g. If this WorkDirectory is tracking - // // a 512 MiB memory with atom size of 32 B and 256 atoms - // // per block. Then it needs 64 Ki counters of 8 bit wide. 
- // // However, if we need 8 Mi atoms to store all our vertices, - // // the second half of the counters would not be used at all - // // (512 MiB hold 16 Mi atoms and we're only using half). - // if (currentIndex > lastCounterIndex) { - // currentIndex = 0; - // } - // if (prev_index == currentIndex) { - // // NOTE: If we have reached the same index as before, - // // we need to decrement the currentCounter to generate - // // a repeatative address. This way the receiver can detect - // // the uselessness of the generated address and ignore it - // currentCounter--; - // break; - // } - // if (popCount[currentIndex] > 0) { - // currentCounter = 0; - // break; - // } - // } - // } - // Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); - // currentCounter++; - - // return ret_addr; - // } - + // CAUTION: This directory only tracks active vertices in the memory + // and it does not have any information on the state of the cache and/or + // the active buffer or the write buffer. Therefore, it might generate a + // read request to an address that might be in any of those. In that case, + // the generated address should be ignored. virtual Addr getNextWork() { - if ((currentCounter == numAtomsPerBlock) || - (popCount[currentIndex] == 0)) { - int other_count = _workCount - popCount[currentIndex]; - if (other_count == 0) { - currentCounter = 0; - } else { - int prev_index = currentIndex; - while (true) { - currentIndex++; - if (currentIndex > lastCounterIndex) { - currentIndex = 0; - } - if (currentIndex == prev_index) { - break; - } - if (popCount[currentIndex] > 0) { - break; - } - } - currentCounter = 0; - } + // Why ask directory if it's empty? 
+ assert(!activeBlockIndices.empty()); + int front_index = activeBlockIndices.front(); + assert(popCount[front_index] > 0); + if ((prevIndex != -1) && (prevIndex != front_index)) { + currentCounter = 0; + } + if (currentCounter == numAtomsPerBlock) { + currentCounter = 0; + activeBlockIndices.pop_front(); + activeBlockIndices.push_back(front_index); } - Addr ret_addr = getAtomAddrFromIndex(currentIndex, currentCounter); + int current_index = activeBlockIndices.front(); + Addr ret_addr = getAtomAddrFromIndex(current_index, currentCounter); + prevIndex = current_index; currentCounter++; return ret_addr; } From c2b08a68d27767737a489c72e7fcf7d80be10bc2 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 09:05:29 -0800 Subject: [PATCH 218/287] Adding pr and updating config scripts. --- configs/accl/bfs.py | 24 ++-- configs/accl/pr-sample.py | 109 -------------- configs/accl/pr.py | 44 +++++- configs/accl/sega.py | 36 +++-- src/accl/graph/base/graph_workload.cc | 157 +++++++++------------ src/accl/graph/base/graph_workload.hh | 38 ++--- src/accl/graph/sega/CenteralController.py | 2 +- src/accl/graph/sega/CoalesceEngine.py | 1 + src/accl/graph/sega/centeral_controller.cc | 10 +- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 53 ++++--- src/accl/graph/sega/coalesce_engine.hh | 5 +- 12 files changed, 201 insertions(+), 280 deletions(-) delete mode 100644 configs/accl/pr-sample.py diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 80331e3aad..829449c599 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -40,20 +40,20 @@ def get_inputs(): argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( - "--verify", - dest="verify", + "--sample", + dest="sample", action="store_const", const=True, default=False, - help="Print final answer", + help="Sample sim stats every 100us", ) argparser.add_argument( - "--sample", - dest="sample", + 
"--verify", + dest="verify", action="store_const", const=True, default=False, - help="Sample statistics", + help="Print final answer", ) args = argparser.parse_args() @@ -70,7 +70,15 @@ def get_inputs(): if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value, sample, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + init_addr, + init_value, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -81,7 +89,7 @@ def get_inputs(): system.create_bfs_workload(init_addr, init_value) if sample: while True: - exit_event = m5.simulate(10000000) + exit_event = m5.simulate(100000000) print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" diff --git a/configs/accl/pr-sample.py b/configs/accl/pr-sample.py deleted file mode 100644 index ac3616dc84..0000000000 --- a/configs/accl/pr-sample.py +++ /dev/null @@ -1,109 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. 
-# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from sega import SEGA - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 10us", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.cache_size, - args.graph, - args.alpha, - args.threshold, - args.verify, - args.sample, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - cache_size, - graph, - alpha, - threshold, - verify, - sample, - ) = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.create_pr_workload(alpha, threshold) - - if sample: - while True: - exit_event = m5.simulate(10000000) 
- print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - m5.stats.dump() - m5.stats.reset() - print(exit_event.getCause()) - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 59e8b924c6..e852e47561 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) argparser.add_argument( "--verify", dest="verify", @@ -56,23 +64,45 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.sample, args.verify, ) - if __name__ == "__m5_main__": - num_gpts, cache_size, graph, alpha, threshold, verify = get_inputs() + ( + num_gpts, + cache_size, + graph, + alpha, + threshold, + sample, + verify, + ) = get_inputs() system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() + system.create_pop_count_directory(64) system.create_pr_workload(alpha, threshold) - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + print(exit_event.getCause()) + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) if verify: 
system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 29a017ba65..7831302228 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,14 +47,18 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str, simple_mem: bool = False + ): super().__init__() + self._simple_mem = simple_mem self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - active_buffer_size = 64, + pending_pull_limit=32, + active_buffer_size=64, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -65,9 +69,15 @@ def __init__(self, edge_memory_size: str, cache_size: str): update_queue_size=32, ) - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(), dram_2=HBM_2000_4H_1x64() - ) + if self._simple_mem: + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + ) + else: + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( @@ -96,18 +106,20 @@ def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + if self._simple_mem: + self.vertex_mem_ctrl.range = vertex_ranges[0] + else: + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit + if self._simple_mem: + pass + else: + self.vertex_mem_ctrl.pch_bit = pch_bit def 
set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image - # def set_edge_image(self, edge_image): - # self.edge_mem_ctrl.image_file = edge_image - - class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 446509201f..0539296cce 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,92 +113,75 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } -// PRWorkload::PRWorkload(float alpha, float threshold, int atom_size): -// GraphWorkload(), alpha(alpha), threshold(threshold), atomSize(atom_size) -// { -// numElementsPerLine = (int) (atomSize / sizeof(WorkListItem)); -// } - -// void -// PRWorkload::init(PacketPtr pkt, int bit_index_base, -// std::bitset& needsPush, -// std::deque& activeBits, -// int& _workCount) -// { -// WorkListItem items[numElementsPerLine]; - -// pkt->writeDataToBlock((uint8_t*) items, atomSize); -// for (int i = 0; i < numElementsPerLine; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// if (items[i].degree > 0) { -// needsPush[bit_index_base + i] = 1; -// activeBits.push_back(bit_index_base + i); -// _workCount++; -// } -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, atomSize); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = 1.0; - -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::applyCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float 
prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return dist >= threshold; -// } - -// bool -// PRWorkload::preWBApply(WorkListItem& wl) -// { -// if (applyCondition(wl) && (wl.degree > 0)) { -// return true; -// } -// return false; -// } - -// std::tuple -// PRWorkload::apply(WorkListItem& wl) -// { -// if (applyCondition(wl)) { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return std::make_tuple(delta_uint, true, true); -// } -// return std::make_tuple(0, false, false); -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, temp_float, wl.degree, wl.edgeIndex -// ); -// } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = readFromFloat(0); + items[i].prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { 
+ weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float dist = std::abs(temp_float - prop_float); + return dist >= threshold; +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + uint32_t delta_uint = readFromFloat(delta); + wl.prop = wl.tempProp; + return delta_uint; +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", + temp_float, prop_float, wl.degree, wl.edgeIndex); +} } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f71955bd16..f335ad9b47 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -77,24 +77,26 @@ class BFSWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; - -// public: -// PRWorkload(float alpha, float threshold); - -// ~PRWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr 
pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 0c21833a05..09a997696d 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -43,6 +43,6 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 76e7d262e8..c2393c2f1e 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -39,6 +39,7 @@ class CoalesceEngine(BaseMemoryEngine): max_resp_per_cycle = Param.Int("Maximum number of vertices to send to " "requestor in each cycle. Used to limit b/w.") + pending_pull_limit = Param.Int("Maximum number of pending pull processes.") active_buffer_size = Param.Int("Maximum number of memory active memory " "atoms ready to send updates. 
This parameter " "and post_push_wb_queue_size should be set " diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 883992e64e..60c78559e4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -110,11 +110,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold, system->cacheLineSize()); -// } +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} void CenteralController::recvDoneSignal() diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 6eb07dbcac..ae2980d050 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -63,7 +63,7 @@ class CenteralController : public ClockedObject virtual void startup() override; void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 98229dde24..8ac40198be 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,10 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), - maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), pendingPullReads(0), - activeBufferSize(params.active_buffer_size), + onTheFlyReqs(0), 
maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), + pendingPullLimit(params.pending_pull_limit), + pendingPullReads(0), activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), nextMemoryEvent([this] { processNextMemoryEvent(); @@ -129,29 +129,17 @@ CoalesceEngine::done() } bool -CoalesceEngine::timeToPull() +CoalesceEngine::enoughSpace() { - return (activeBuffer.size() + pendingPullReads) < activeBufferSize; -} - -bool -CoalesceEngine::canSchedulePull() -{ - // TODO: Maybe a good idea to change this to - // activeBuffer.size() + pendingPullReads + pullsScheduled < activeBufferSize - return pullsScheduled < 1; -} - -bool -CoalesceEngine::workLeftInMem() -{ - return !directory->empty(); + return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; } bool CoalesceEngine::pullCondition() { - return ((activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize); + bool enough_space = enoughSpace(); + bool schedule_limit = pullsScheduled < pendingPullLimit; + return enough_space && schedule_limit; } // addr should be aligned to peerMemoryAtomSize @@ -784,12 +772,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) atom_active |= graphWorkload->activeCondition( cacheBlocks[block_index].items[index]); } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); - } PacketPtr pkt = createWritePacket( cacheBlocks[block_index].addr, peerMemoryAtomSize, @@ -797,8 +779,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - memPort.sendPacket(pkt); - onTheFlyReqs++; + if (atom_active) { + activeCacheBlocks.erase(block_index); + if 
(enoughSpace()) { + activeBuffer.emplace_back(pkt, curTick()); + } else { + int count = directory->activate(cacheBlocks[block_index].addr); + stats.blockActiveCount.sample(count); + stats.frontierSize.sample(directory->workCount()); + memPort.sendPacket(pkt); + onTheFlyReqs++; + } + } else { + memPort.sendPacket(pkt); + onTheFlyReqs++; + } cacheBlocks[block_index].reset(); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c2da6a90cd..f605704b6d 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -118,6 +118,7 @@ class CoalesceEngine : public BaseMemoryEngine UniqueFIFO activeCacheBlocks; int pullsScheduled; + int pendingPullLimit; int pendingPullReads; // A map from addr to sendMask. sendMask determines which bytes to // send for push when getting the read response from memory. @@ -128,9 +129,7 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> activeBuffer; std::deque> postPushWBQueue; - bool timeToPull(); - bool canSchedulePull(); - bool workLeftInMem(); + bool enoughSpace(); bool pullCondition(); int getBlockIndex(Addr addr); From ccaa539854ee30fc4ea9e6289968ddcf9700edf1 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 9 Nov 2022 21:24:39 -0800 Subject: [PATCH 219/287] Updating activeCondition for PR. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 0539296cce..05c8d05089 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -160,7 +160,7 @@ PRWorkload::activeCondition(WorkListItem wl) float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); float dist = std::abs(temp_float - prop_float); - return dist >= threshold; + return (dist >= threshold) && (wl.degree > 0); } uint32_t From 3747d9f40e7dd23a7e958621090b02ba58cd79c9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 13 Nov 2022 15:36:40 -0800 Subject: [PATCH 220/287] Adding SSSP and CC --- src/accl/graph/base/graph_workload.cc | 172 ++++++++++++++++++++++++++ src/accl/graph/base/graph_workload.hh | 58 +++++++++ 2 files changed, 230 insertions(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 05c8d05089..e36c074da9 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -113,6 +113,121 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) ); } +void +BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + 
+uint32_t +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +{ + return 1; +} + +bool +BFSVisitedWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +BFSVisitedWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + +void +SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + size_t pkt_size = pkt->getSize(); + uint64_t aligned_addr = roundDown(initAddr, pkt_size); + + if (pkt->getAddr() == aligned_addr) { + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + + int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); + items[index].tempProp = initValue; + if (activeCondition(items[index])) { + dir->activate(aligned_addr); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +SSSPWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +SSSPWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value + weight; +} + +bool +SSSPWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +SSSPWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +SSSPWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + + void PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -184,4 +299,61 @@ PRWorkload::printWorkListItem(const WorkListItem wl) temp_float, prop_float, wl.degree, 
wl.edgeIndex); } +void +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + Addr pkt_addr = pkt->getAddr(); + size_t pkt_size = pkt->getSize(); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; + + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; + items[i].prop = -1; + atom_active |= activeCondition(items[i]); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); +} + +uint32_t +CCWorkload::reduce(uint32_t update, uint32_t value) +{ + return std::min(update, value); +} + +uint32_t +CCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +bool +CCWorkload::activeCondition(WorkListItem wl) +{ + return (wl.tempProp < wl.prop) && (wl.degree > 0); +} + +uint32_t +CCWorkload::apply(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + return wl.prop; +} + +std::string +CCWorkload::printWorkListItem(const WorkListItem wl) +{ + return csprintf( + "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex + ); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index f335ad9b47..de2877d6e8 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -76,6 +76,48 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class BFSVisitedWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~BFSVisitedWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + +class SSSPWorkload : public GraphWorkload +{ + private: + uint64_t initAddr; + uint32_t initValue; + + public: + SSSPWorkload(uint64_t init_addr, uint32_t init_value): + initAddr(init_addr), initValue(init_value) + {} + + ~SSSPWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class PRWorkload : public GraphWorkload { @@ -98,6 +140,22 @@ class PRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; +class CCWorkload : public GraphWorkload +{ + + public: + CCWorkload() {} + + ~CCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual bool activeCondition(WorkListItem wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + } #endif // __ACCL_GRAPH_BASE_GRAPH_WORKLOAD_HH__ From 000103e41a94e4baf407eca22e44c3aabb0fe972 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 11 Nov 2022 14:40:50 -0800 Subject: [PATCH 221/287] Adding option to use SimpleMemory for vertex memory. 
--- configs/accl/bfs.py | 17 ++- configs/accl/pr.py | 20 ++- configs/accl/real-graph-gen.py | 16 ++- configs/accl/sega.py | 34 ++--- .../accl/{sega-simple.py => sega_simple.py} | 133 ++++++++---------- 5 files changed, 113 insertions(+), 107 deletions(-) rename configs/accl/{sega-simple.py => sega_simple.py} (50%) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 829449c599..806aa8a915 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. -from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,6 +71,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.simple, args.sample, args.verify, ) @@ -76,10 +84,15 @@ def get_inputs(): graph, init_addr, init_value, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e852e47561..e3d7c764ad 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -24,7 +24,6 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-from sega import SEGA import m5 import argparse @@ -39,6 +38,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -64,10 +71,12 @@ def get_inputs(): args.graph, args.alpha, args.threshold, + args.simple, args.sample, args.verify, ) + if __name__ == "__m5_main__": ( num_gpts, @@ -75,10 +84,15 @@ def get_inputs(): graph, alpha, threshold, + simple, sample, verify, ) = get_inputs() - + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA system = SEGA(num_gpts, cache_size, graph) root = Root(full_system=False, system=system) @@ -95,7 +109,6 @@ def get_inputs(): ) m5.stats.dump() m5.stats.reset() - print(exit_event.getCause()) if exit_event.getCause() != "simulate() limit reached": break else: @@ -106,3 +119,4 @@ def get_inputs(): ) if verify: system.print_answer() + diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py index b943a925c1..332bb67452 100644 --- a/configs/accl/real-graph-gen.py +++ b/configs/accl/real-graph-gen.py @@ -45,8 +45,11 @@ def get_inputs(): if __name__ == "__main__": graph_path, num_gpts = get_inputs() + graph_sorter = os.environ.get("GRAPH_SORTER") graph_reader = os.environ.get("GRAPH_READER") + if graph_sorter is None: + raise ValueError(f"No value for $GRAPH_SORTER.") if graph_reader is None: raise ValueError(f"No value for $GRAPH_READER.") @@ -54,6 +57,17 @@ def get_inputs(): raise ValueError(f"{graph_path} does not exist.") graph_dir = os.path.dirname(graph_path) + sorted_graph = f"{graph_dir}/sorted_graph.txt" + if not os.path.exists(sorted_graph): + print(f"Sorting {graph_path} into {sorted_graph}.") + subprocess.run( + [ + "python", + f"{graph_sorter}", + f"{graph_path}", + 
f"{sorted_graph}", + ] + ) if not "binaries" in os.listdir(graph_dir): print(f"binaries directory not found in {graph_dir}") os.mkdir(f"{graph_dir}/binaries") @@ -80,7 +94,7 @@ def get_inputs(): subprocess.run( [ f"{graph_reader}", - f"{graph_path}", + f"{sorted_graph}", "false", f"{num_gpts}", "32", diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 7831302228..1ea36ea49e 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -48,11 +48,9 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): def __init__( - self, edge_memory_size: str, cache_size: str, simple_mem: bool = False - ): + self, edge_memory_size: str, cache_size: str): super().__init__() - self._simple_mem = simple_mem - self.wl_engine = WLEngine(update_queue_size=128, register_file_size=64) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, @@ -69,20 +67,14 @@ def __init__( update_queue_size=32, ) - if self._simple_mem: - self.vertex_mem_ctrl = SimpleMemory( - latency="122ns", latency_var="0ns", bandwidth="28GiB/s" - ) - else: - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), + dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) + ) self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False - ) + range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -106,17 +98,11 @@ def setReqPort(self, port): self.push_engine.out_ports = port def 
set_vertex_range(self, vertex_ranges): - if self._simple_mem: - self.vertex_mem_ctrl.range = vertex_ranges[0] - else: - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] def set_vertex_pch_bit(self, pch_bit): - if self._simple_mem: - pass - else: - self.vertex_mem_ctrl.pch_bit = pch_bit + self.vertex_mem_ctrl.pch_bit = pch_bit def set_edge_image(self, edge_image): self.edge_mem_ctrl.dram.image_file = edge_image diff --git a/configs/accl/sega-simple.py b/configs/accl/sega_simple.py similarity index 50% rename from configs/accl/sega-simple.py rename to configs/accl/sega_simple.py index 7ec19c92ae..f59fa71a79 100644 --- a/configs/accl/sega-simple.py +++ b/configs/accl/sega_simple.py @@ -24,90 +24,87 @@ # (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
-import m5 -import argparse - from math import log from m5.objects import * + def interleave_addresses(plain_range, num_channels, cache_line_size): intlv_low_bit = log(cache_line_size, 2) intlv_bits = log(num_channels, 2) ret = [] for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) return ret + class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): + def __init__( + self, edge_memory_size: str, cache_size: str): super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) + self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="0ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") + + self.edge_mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8( + range=AddrRange(edge_memory_size), in_addr_map=False) + ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) def getRespPort(self): return self.wl_engine.in_ports + def setRespPort(self, port): self.wl_engine.in_ports = port def getReqPort(self): return self.push_engine.out_ports + def setReqPort(self, port): self.push_engine.out_ports = port def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image + self.edge_mem_ctrl.dram.image_file = edge_image class SEGA(System): def __init__(self, num_mpus, cache_size, graph_path): super(SEGA, self).__init__() self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' + self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" @@ -115,14 +112,12 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) + AddrRange(start=0, size="4GiB"), num_mpus, 32 + ) gpts = [] for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) + gpt = GPT("4GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) @@ -134,32 +129,16 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - 
argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + def create_pop_count_directory(self, atoms_per_block): + for gpt in self.gpts: + gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) - m5.instantiate() + def create_pr_workload(self, alpha, threshold): + self.ctrl.createPRWorkload(alpha, threshold) - system.create_initial_bfs_update(init_addr, init_value) + def print_answer(self): + self.ctrl.printAnswerToHostSimout() - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") From 4b30d61b3a7b5261973467c478d2243da896d83b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:55:54 -0800 Subject: [PATCH 222/287] Removing graph gen scripts and moved to sega-utils. --- configs/accl/real-graph-gen.py | 107 ------------------------ configs/accl/synth-graph-gen.py | 139 -------------------------------- 2 files changed, 246 deletions(-) delete mode 100644 configs/accl/real-graph-gen.py delete mode 100644 configs/accl/synth-graph-gen.py diff --git a/configs/accl/real-graph-gen.py b/configs/accl/real-graph-gen.py deleted file mode 100644 index 332bb67452..0000000000 --- a/configs/accl/real-graph-gen.py +++ /dev/null @@ -1,107 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("path", type=str, help="Path to the graph file.") - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.path, args.num_gpts - - -if __name__ == "__main__": - graph_path, num_gpts = get_inputs() - - graph_sorter = os.environ.get("GRAPH_SORTER") - graph_reader = os.environ.get("GRAPH_READER") - - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - - if not os.path.exists(graph_path): - raise ValueError(f"{graph_path} does not exist.") - - graph_dir = os.path.dirname(graph_path) - sorted_graph = f"{graph_dir}/sorted_graph.txt" - if not os.path.exists(sorted_graph): - print(f"Sorting {graph_path} into {sorted_graph}.") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}", - f"{sorted_graph}", - ] - ) - if not "binaries" in os.listdir(graph_dir): - print(f"binaries directory not found in {graph_dir}") - os.mkdir(f"{graph_dir}/binaries") - print(f"Created {graph_dir}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_dir}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_dir}/binaries") - os.mkdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - print(f"Created {graph_dir}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_dir}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_dir}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print(f"Deleted all the files in {graph_dir}/binaries/gpts_{num_gpts}") - subprocess.run( - [ - f"{graph_reader}", - 
f"{sorted_graph}", - "false", - f"{num_gpts}", - "32", - f"{graph_dir}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_dir}/binaries/gpts_{num_gpts}" - ) diff --git a/configs/accl/synth-graph-gen.py b/configs/accl/synth-graph-gen.py deleted file mode 100644 index 15e4a6eff2..0000000000 --- a/configs/accl/synth-graph-gen.py +++ /dev/null @@ -1,139 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import os -import argparse -import subprocess - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument( - "scale", type=int, help="The scale of the synth graph to generate." - ) - argparser.add_argument( - "deg", - type=int, - help="The average degree of the synth graph to generate.", - ) - argparser.add_argument( - "num_gpts", - type=int, - help="Number gpts to create synth graph binaries for.", - ) - - args = argparser.parse_args() - return args.scale, args.deg, args.num_gpts - - -if __name__ == "__main__": - scale, deg, num_gpts = get_inputs() - - base_dir = os.environ.get("GRAPH_DIR", default="/tmp") - graph_gen = os.environ.get("GRAPH_GEN") - graph_reader = os.environ.get("GRAPH_READER") - graph_sorter = os.environ.get("GRAPH_SORTER") - if graph_gen is None: - raise ValueError(f"No value for $GRAPH_GEN.") - if graph_reader is None: - raise ValueError(f"No value for $GRAPH_READER.") - if graph_sorter is None: - raise ValueError(f"No value for $GRAPH_SORTER") - - graph_path = os.path.join(base_dir, f"graph_{scale}_{deg}") - if not os.path.exists(graph_path): - print(f"{graph_path} does not exist already.") - os.mkdir(graph_path) - print(f"Created {graph_path}") - - if not "graph.txt" in os.listdir(graph_path): - print(f"graph.txt not found in {graph_path}") - for delete in os.scandir(graph_path): - os.remove(delete.path) - print(f"Deleted everything in {graph_path}") - subprocess.run( - [ - f"{graph_gen}", - f"{scale}", - f"{deg}", - f"{graph_path}/graph_unordered.txt", - ] - ) - print(f"Generated a graph with scale " f"{scale} and deg {deg}") - subprocess.run( - [ - "python", - f"{graph_sorter}", - f"{graph_path}/graph_unordered.txt", - f"{graph_path}/graph.txt", - ] - ) - print( - f"Sorted the graph here {graph_path}/graph_unordered.txt" - f" and saved in {graph_path}/graph.txt" - ) - subprocess.run(["rm", f"{graph_path}/graph_unordered.txt"]) - print(f"Deleted {graph_path}/graph_unordered.txt") - - if not "binaries" in 
os.listdir(graph_path): - print(f"binaries directory not found in {graph_path}") - os.mkdir(f"{graph_path}/binaries") - print(f"Created {graph_path}/binaries") - - if not f"gpts_{num_gpts}" in os.listdir(f"{graph_path}/binaries"): - print(f"gpts_{num_gpts} not found in {graph_path}/binaries") - os.mkdir(f"{graph_path}/binaries/gpts_{num_gpts}") - print(f"Created {graph_path}/binaries/gpts_{num_gpts}") - - expected_bins = ["vertices"] + [f"edgelist_{i}" for i in range(num_gpts)] - if not all( - [ - binary in os.listdir(f"{graph_path}/binaries/gpts_{num_gpts}") - for binary in expected_bins - ] - ): - print( - f"Not all expected binaries found in {graph_path}/binaries/gpts_{num_gpts}" - ) - for delete in os.scandir(f"{graph_path}/binaries/gpts_{num_gpts}"): - os.remove(delete.path) - print( - f"Deleted all the files in {graph_path}/binaries/gpts_{num_gpts}" - ) - subprocess.run( - [ - f"{graph_reader}", - f"{graph_path}/graph.txt", - "false", - f"{num_gpts}", - "32", - f"{graph_path}/binaries/gpts_{num_gpts}", - ] - ) - print( - f"Created the graph binaries in " - f"{graph_path}/binaries/gpts_{num_gpts}" - ) From e10ce6142d0a7e255121d14a2eefe2715756bc1c Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 12:57:19 -0800 Subject: [PATCH 223/287] Adding BSP mode. 
--- src/accl/graph/base/data_structs.hh | 30 ++- src/accl/graph/base/graph_workload.hh | 2 +- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/CoalesceEngine.py | 3 - src/accl/graph/sega/centeral_controller.cc | 66 ++++-- src/accl/graph/sega/centeral_controller.hh | 10 +- src/accl/graph/sega/coalesce_engine.cc | 257 ++++++++++++++------- src/accl/graph/sega/coalesce_engine.hh | 17 +- src/accl/graph/sega/enums.cc | 15 +- src/accl/graph/sega/enums.hh | 18 ++ src/accl/graph/sega/mpu.hh | 4 + 11 files changed, 308 insertions(+), 117 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 84233ae39c..f09a0dd167 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -43,28 +43,34 @@ struct __attribute__ ((packed)) WorkListItem { uint32_t tempProp : 32; uint32_t prop : 32; - uint32_t degree : 32; uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeFuture: 1; std::string to_string() { return csprintf("WorkListItem{tempProp: %u, prop: %u, edgeIndex: %u, " - "degree: %u}", tempProp, prop, edgeIndex, degree); + "degree: %u, activeNow: %s, activeFuture: %s}", + tempProp, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeFuture ? 
"true" : "false"); } WorkListItem(): tempProp(0), prop(0), + edgeIndex(0), degree(0), - edgeIndex(0) + activeNow(false), + activeFuture(false) {} WorkListItem(uint32_t temp_prop, uint32_t prop, - uint32_t degree, uint32_t edge_index): - tempProp(temp_prop), - prop(prop), - degree(degree), - edgeIndex(edge_index) + uint32_t degree, uint32_t edge_index, + bool active_now, bool active_future): + tempProp(temp_prop), prop(prop), edgeIndex(edge_index), degree(degree), + activeNow(active_now), activeFuture(active_future) {} }; @@ -158,6 +164,10 @@ class UniqueFIFO return fifo.size(); } + void clear() { + fifo.clear(); + } + bool empty() { return fifo.empty(); } @@ -174,6 +184,10 @@ class UniqueFIFO assert(it != fifo.end()); fifo.erase(it); } + + void operator=(const UniqueFIFO& rhs) { + fifo = rhs.fifo; + } }; } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index de2877d6e8..14a6561ae3 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,7 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; - virtual bool activeCondition(WorkListItem wl) = 0; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 09a997696d..8b43c90102 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -42,6 +42,9 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") cxx_exports = [ + PyBindMethod("setAsyncMode"), + PyBindMethod("setBSPMode"), + PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), PyBindMethod("createPRWorkload"), 
PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index c2393c2f1e..25f8a1c58b 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -27,7 +27,6 @@ from m5.params import * from m5.proxy import * -from m5.util.pybind import PyBindMethod from m5.objects.BaseMemoryEngine import BaseMemoryEngine class CoalesceEngine(BaseMemoryEngine): @@ -48,5 +47,3 @@ class CoalesceEngine(BaseMemoryEngine): "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") - - cxx_exports = [PyBindMethod("createPopCountDirectory")] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 60c78559e4..6c924a4703 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -42,7 +42,9 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), - system(params.system) + system(params.system), + mode(ProcessingMode::NOT_SET), + state(BulkSynchronousState::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -50,11 +52,41 @@ CenteralController::CenteralController(const Params& params): } } +void +CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSWorkload(init_addr, init_value); +} + +void +CenteralController::createPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + +void +CenteralController::createPopCountDirectory(int atoms_per_block) +{ + fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " + "mode by calling either setAsyncMode or setBSPMode.") + if (mode == ProcessingMode::ASYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } + if (mode == 
ProcessingMode::BULK_SYNCHRONOUS) { + for (auto mpu: mpuVector) { + mpu->createBSPPopCountDirectory(atoms_per_block); + } + } +} + void CenteralController::startup() { for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); + mpu->setProcessingMode(mode); mpu->recvWorkload(workload); } @@ -83,7 +115,7 @@ CenteralController::startup() for (auto mpu: mpuVector) { mpu->postMemInitSetup(); - if (!mpu->running() && (mpu->workCount()> 0)) { + if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } @@ -104,18 +136,6 @@ CenteralController::createReadPacket(Addr addr, unsigned int size) return pkt; } -void -CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) -{ - workload = new BFSWorkload(init_addr, init_value); -} - -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} - void CenteralController::recvDoneSignal() { @@ -124,9 +144,25 @@ CenteralController::recvDoneSignal() done &= mpu->done(); } - if (done) { + if (done && mode == ProcessingMode::ASYNCHRONOUS) { exitSimLoopNow("no update left to process."); } + + if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { + assert(state != BulkSynchronousState::DONT_CARE); + if (state == BulkSynchronousState::APPLYING) { + // TODO: + // 1- Toggle directories + // 2- Check if termination condition is met + // 3- If yes, schedule exit event, + // 4- If not switch state to consuming. 
+ exitSimLoopNow("applying done."); + } else if (state == BulkSynchronousState::CONSUMING) { + // TODO: + // Schedule Bulk apply + exitSimLoopNow("consuming done."); + } + } } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ae2980d050..ab0e0c0c09 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -33,6 +33,7 @@ #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/enums.hh" #include "accl/graph/sega/mpu.hh" #include "base/addr_range.hh" #include "params/CenteralController.hh" @@ -46,9 +47,11 @@ class CenteralController : public ClockedObject { private: System* system; - Addr maxVertexAddr; + ProcessingMode mode; + BulkSynchronousState state; + std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -62,6 +65,11 @@ class CenteralController : public ClockedObject CenteralController(const CenteralControllerParams ¶ms); virtual void startup() override; + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } + void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + + void createPopCountDirectory(int atoms_per_block); + void createBFSWorkload(Addr init_addr, uint32_t init_value); void createPRWorkload(float alpha, float threshold); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8ac40198be..bfe3fe21b8 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -34,7 +34,6 @@ #include "base/intmath.hh" #include "debug/CacheBlockState.hh" #include "debug/CoalesceEngine.hh" -#include "debug/MSDebug.hh" #include "debug/SEGAStructureSize.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -43,7 +42,7 @@ namespace gem5 { CoalesceEngine::CoalesceEngine(const Params ¶ms): - BaseMemoryEngine(params), lastAtomAddr(0), + BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), 
lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), @@ -77,6 +76,8 @@ CoalesceEngine::registerMPU(MPU* mpu) owner = mpu; } + +// NOTE: Used for initializing memory and reading the final answer void CoalesceEngine::recvFunctional(PacketPtr pkt) { @@ -85,10 +86,6 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); - // FIXME: Check postPushWBQueue for hits - // Is it really the case though. I don't think at this time - // beacuse we check done after handleMemResp and make sure all - // the writes to memory are done before scheduling an exit event if ((cacheBlocks[block_index].addr == addr) && (cacheBlocks[block_index].valid)) { assert(cacheBlocks[block_index].state == CacheState::IDLE); @@ -100,7 +97,7 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) memPort.sendFunctional(pkt); } } else { - graphWorkload->init(pkt, directory); + graphWorkload->init(pkt, currentDirectory); if (pkt->getAddr() > lastAtomAddr) { lastAtomAddr = pkt->getAddr(); } @@ -111,21 +108,46 @@ CoalesceEngine::recvFunctional(PacketPtr pkt) void CoalesceEngine::postMemInitSetup() { - directory->setLastAtomAddr(lastAtomAddr); + currentDirectory->setLastAtomAddr(lastAtomAddr); } void -CoalesceEngine::createPopCountDirectory(int atoms_per_block) +CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { - directory = new PopCountDirectory( + currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = nullptr; +} + +void +CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) +{ + currentDirectory = new PopCountDirectory( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); + futureDirectory = new PopCountDirectroy( + peerMemoryRange, atoms_per_block, peerMemoryAtomSize); +} + +void 
+CoalesceEngine::swapDirectories() +{ + assert(currentDirectory->empty()); + assert(currentActiveCacheBlocks.empty()); + // assert currentDirectory is empty + WorkDirectory* temp = currentDirectory; + currentDirectory = futureDirectory; + futureDirectory = temp; + + currentActiveCacheBlocks.clear(); + currentActiveCacheBlocks = futureActiveCacheBlocks; + futureActiveCacheBlocks.clear(); } bool CoalesceEngine::done() { - return memoryFunctionQueue.empty() && activeCacheBlocks.empty() && - activeBuffer.empty() && directory->empty() && (onTheFlyReqs == 0); + return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); } bool @@ -249,16 +271,21 @@ CoalesceEngine::recvWLRead(Addr addr) // NOTE: The cache block could still be active but // not dirty. If active we only have to active tracking // but can throw the data away. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -360,16 +387,21 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) // Since it is going to the cache, cache will be responsible for // tracking this. Push to activeCacheBlocks for simulator speed // instead of having to search for active blocks in the cache. - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); + currentActiveCacheBlocks.push_back(block_index); } - if (atom_active) { - int count = directory->deactivate(addr); - activeCacheBlocks.push_back(block_index); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_future) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = futureDirectory->deactivate(addr); + futureActiveCacheBlocks.push_back(block_index); } assert(MSHR.find(block_index) != MSHR.end()); @@ -420,15 +452,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(items[index]); + atom_active |= items[index].activeNow; } - if (atom_active) { - int count = directory->deactivate(addr); + if (atom_active_now) { + // TODO: Add sampling of blockActiveCount and frontierSize here + int count = currentDirectory->deactivate(addr); activeBuffer.emplace_back(pkt, curTick()); - 
stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); } else { delete pkt; } @@ -486,6 +519,9 @@ CoalesceEngine::processNextResponseEvent() stats.responseQueueLatency.sample( waiting_ticks * 1e9 / getClockFrequency()); if (num_responses_sent >= maxRespPerCycle) { + // TODO: Add the condition to check that front of queue can be + // sent to WLEngine. i.e. it has at least been in the queue for + // one cycle. if (!responseQueue.empty()) { stats.responsePortShortage++; } @@ -533,12 +569,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (wl.tempProp != cacheBlocks[block_index].items[wl_offset].tempProp) { cacheBlocks[block_index].dirty |= true; } + + bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; - if ((graphWorkload->activeCondition(cacheBlocks[block_index].items[wl_offset])) && - (!activeCacheBlocks.find(block_index))) { - activeCacheBlocks.push_back(block_index); - if (!owner->running()) { - owner->start(); + if (mode == ProcessingMode::ASYNCHRONOUS) { + cacheBlocks[block_index].activeNow |= active; + if (active && (!currentActiveCacheBlocks.find(block_index))) { + currentActiveCacheBlocks.push_back(block_index); + if (!owner->running()) { + owner->start(); + } + } + } + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + cacheBlocks[block_index].activeFuture |= active; + if (active && (!futureActiveCacheBlocks.find(block_index))) { + futureActiveCacheBlocks.push_back(block_index); } } @@ -565,16 +611,22 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) schedule(nextMemoryEvent, nextCycle()); } } else { - bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - 
cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } - if (atom_active) { - activeCacheBlocks.erase(block_index); - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + if (atom_active_now) { + // TODO: Sample frontier size and blockCount here. + currentActiveCacheBlocks.erase(block_index); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); + } + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); } cacheBlocks[block_index].reset(); } @@ -586,6 +638,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; + if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { owner->recvDoneSignal(); } @@ -623,6 +676,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } + // FIXME: done() might have a different meaning depending on + // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -659,6 +714,16 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); + // NOTE: If an atom is in the postPushWBQueue, + // the it is definitely currently not active. 
+ bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; wb = postPushWBQueue.erase(wb); @@ -677,7 +742,19 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].valid = true; cacheBlocks[block_index].dirty = true; cacheBlocks[block_index].lastChangedTick = curTick(); - activeCacheBlocks.push_back(block_index); + // If an atom is in the activeBuffer, + // then it is definitely currently active. + currentActiveCacheBlocks.push_back(block_index); + // NOTE: Residence in the activeBuffer does not + // signify anything about future activity. + bool atom_active_future = false; + for (int index = 0; index < numElementsPerLine; index++) + { + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; + } + if (atom_active_future) { + futureActiveCacheBlocks.push_back(block_index); + } need_send_pkt = false; ab = activeBuffer.erase(ab); @@ -767,10 +844,11 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) // NOTE: If the atom we're writing back is active, we have to // stop tracking it in the cache and start tracking it in the memory. 
- bool atom_active = false; + bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition( - cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; + atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } PacketPtr pkt = createWritePacket( @@ -779,18 +857,25 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) DPRINTF(CoalesceEngine, "%s: Created a write packet to " "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); - if (atom_active) { - activeCacheBlocks.erase(block_index); + if (atom_active_future) { + futureActiveCacheBlocks.erase(block_index); + } + if (atom_active_now) { + currentActiveCacheBlocks.erase(block_index); if (enoughSpace()) { activeBuffer.emplace_back(pkt, curTick()); } else { - int count = directory->activate(cacheBlocks[block_index].addr); - stats.blockActiveCount.sample(count); - stats.frontierSize.sample(directory->workCount()); + int count = currentDirectory->activate(cacheBlocks[block_index].addr); + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + // stats.blockActiveCount.sample(count); + // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { + if (atom_active_future) { + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + } memPort.sendPacket(pkt); onTheFlyReqs++; } @@ -810,17 +895,24 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) void CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) { - if (postPushWBQueue.empty()) { - return; - } - - PacketPtr wb_pkt; - Tick pkt_tick; - std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); - if (schedule_tick == pkt_tick) { - memPort.sendPacket(wb_pkt); - onTheFlyReqs++; - postPushWBQueue.pop_front(); + if (!postPushWBQueue.empty()) 
{ + PacketPtr wb_pkt; + Tick pkt_tick; + std::tie(wb_pkt, pkt_tick) = postPushWBQueue.front(); + if (schedule_tick == pkt_tick) { + WorkListItem items[numElementsPerLine]; + wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + bool atom_active_future = false; + for (int index = 0; index < numElementPerLine; index++) { + atom_active_future |= items[index].activeFuture; + } + if (atom_active_future) { + futureDirectory->activate(wb_pkt->getAddr()); + } + memPort.sendPacket(wb_pkt); + onTheFlyReqs++; + postPushWBQueue.pop_front(); + } } } @@ -828,8 +920,8 @@ void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { pullsScheduled--; - if (!directory->empty()) { - Addr addr = directory->getNextWork(); + if (!currentDirectory->empty()) { + Addr addr = currentDirectory->getNextWork(); int block_index = getBlockIndex(addr); bool in_cache = cacheBlocks[block_index].addr == addr; @@ -875,8 +967,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + - directory->workCount() + activeBuffer.size(); + return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); } void @@ -905,9 +996,10 @@ CoalesceEngine::processNextApplyEvent() pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(items[index])) { + if (items[index].activeNow) { Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(items[index]); + items[index].activeNow = false; owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); pullsReceived--; @@ -919,12 +1011,12 @@ CoalesceEngine::processNextApplyEvent() pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, peerMemoryAtomSize); - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
graphWorkload->activeCondition(items[index]); + atom_active_now |= items[index].activeNow; } // NOTE: If the atom is not active anymore. - if (!atom_active) { + if (!atom_active_now) { PacketPtr wb_pkt = createWritePacket(pkt->getAddr(), peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); @@ -946,9 +1038,10 @@ CoalesceEngine::processNextApplyEvent() int block_index = activeCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { - if (graphWorkload->activeCondition(cacheBlocks[block_index].items[index])) { + if (cacheBlocks[block_index].items[index].activeNow) { Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); + cacheBlocks[block_index].items[index].activeNow = false; cacheBlocks[block_index].dirty = true; owner->recvVertexPush(addr, delta, cacheBlocks[block_index].items[index].edgeIndex, @@ -959,20 +1052,20 @@ CoalesceEngine::processNextApplyEvent() } } - bool atom_active = false; + bool atom_active_now = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= graphWorkload->activeCondition(cacheBlocks[block_index].items[index]); + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; } // NOTE: If we have reached the last item in the cache block - if (!atom_active) { - activeCacheBlocks.erase(block_index); + if (!atom_active_now) { + currentActiveCacheBlocks.erase(block_index); } break; } // NOTE: If the block with index at the front of activeCacheBlocks // is not in IDLE state, then roll the that index to the back - activeCacheBlocks.pop_front(); - activeCacheBlocks.push_back(block_index); + currentActiveCacheBlocks.pop_front(); + currentActiveCacheBlocks.push_back(block_index); // NOTE: If we have visited all the items initially in the FIFO. 
num_visited_indices++; if (num_visited_indices == initial_fifo_length) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f605704b6d..39f2491232 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -96,7 +96,9 @@ class CoalesceEngine : public BaseMemoryEngine }; MPU* owner; - WorkDirectory* directory; + ProcessingMode mode; + WorkDirectory* currentDirectory; + WorkDirectory* futureDirectory; GraphWorkload* graphWorkload; Addr lastAtomAddr; @@ -114,8 +116,9 @@ class CoalesceEngine : public BaseMemoryEngine // Tracking work in cache int pullsReceived; - // NOTE: Remember to erase from this upon eviction from cache - UniqueFIFO activeCacheBlocks; + // NOTE: Remember to erase from these upon eviction from cache + UniqueFIFO currentActiveCacheBlocks; + UniqueFIFO futureActiveCacheBlocks; int pullsScheduled; int pendingPullLimit; @@ -195,12 +198,14 @@ class CoalesceEngine : public BaseMemoryEngine CoalesceEngine(const Params ¶ms); void registerMPU(MPU* mpu); + void setProcessingMode(ProcessingMode _mode) { mode = _mode; } + void createAsyncPopCountDirectory(int atoms_per_block); + void createBSPPopCountDirectory(int atoms_per_block); void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); - - void createPopCountDirectory(int atoms_per_block); + void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index de5d569c18..83f3033427 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -39,7 +39,6 @@ const char* cacheStateStrings[NUM_CACHE_STATE] = { "PENDING_WB" }; - const char* readReturnStatusStrings[NUM_READ_RETURN_STATUS] = { "ACCEPT", @@ -53,4 +52,18 @@ const char* 
readDestinationStrings[NUM_READ_DESTINATION] = "READ_FOR_PUSH" }; +const char* processingModeStrings[NUM_PROCESSING_MODE] = +{ + "NOT_SET", + "ASYNCHRONOUS", + "BULK_SYNCHRONOUS" +}; + +const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = +{ + "NOT_SET", + "CONSUMING", + "APPLYING" +}; + } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 6153386b71..f6d199bf7d 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -60,6 +60,24 @@ enum ReadDestination }; extern const char* readDestinationStrings[NUM_READ_DESTINATION]; +enum ProcessingMode +{ + NOT_SET, + ASYNCHRONOUS, + BULK_SYNCHRONOUS, + NUM_PROCESSING_MODE +}; +extern const char* processingModeStrings[NUM_PROCESSING_MODE]; + +enum BulkSynchronousStates +{ + NOT_SET, + CONSUMING, + APPLYING, + NUM_BULK_SYNCHRONOUS_STATE, +} +extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; + } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index ad18a0d5a5..358394ffc5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -63,6 +63,10 @@ class MPU : public SimObject MPU(const Params& params); void registerCenteralController(CenteralController* centeral_controller); + void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } From 454e1e3a81c2818ea532183335fd94e731899326 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 
13:12:57 -0800 Subject: [PATCH 224/287] Fixing enums --- src/accl/graph/sega/centeral_controller.cc | 5 ++++- src/accl/graph/sega/enums.hh | 2 +- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6c924a4703..6e5f3ffcec 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -113,6 +113,9 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); + if (mode == ProcessingMode::BULK_SYNCHRONOUS) { + state = BulkSynchronousStates::CONSUMING; + } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -149,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); + assert(state != BulkSynchronousState::NOT_SET); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index f6d199bf7d..8280f122c3 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -75,7 +75,7 @@ enum BulkSynchronousStates CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, -} +}; extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; } // namespace gem5 From f4b8685a29d80717374c2d222bfc96e5cec25266 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:15:52 -0800 Subject: [PATCH 225/287] Further fixes for enums. 
--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/enums.cc | 2 +- src/accl/graph/sega/enums.hh | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6e5f3ffcec..c6b9cf7a52 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -44,7 +44,7 @@ CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::NOT_SET) + state(BulkSynchronousState::DONT_CARE) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -152,7 +152,7 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::NOT_SET); + assert(state != BulkSynchronousState::DONT_CARE); if (state == BulkSynchronousState::APPLYING) { // TODO: // 1- Toggle directories diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 83f3033427..099594e9eb 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -61,7 +61,7 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = { - "NOT_SET", + "DONT_CARE", "CONSUMING", "APPLYING" }; diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 8280f122c3..4c94412c9b 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -71,7 +71,7 @@ extern const char* processingModeStrings[NUM_PROCESSING_MODE]; enum BulkSynchronousStates { - NOT_SET, + DONT_CARE, CONSUMING, APPLYING, NUM_BULK_SYNCHRONOUS_STATE, From c3fd13291d5a4ecf5e43713888a4de11769b05a4 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:21:12 -0800 Subject: [PATCH 226/287] Fixing typos --- src/accl/graph/sega/enums.hh | 2 +- src/accl/graph/sega/mpu.hh | 4 ++-- 2 
files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 4c94412c9b..969ee8a976 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,7 +69,7 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousStates +enum BulkSynchronousState { DONT_CARE, CONSUMING, diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 358394ffc5..7d75e3e0b7 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -64,8 +64,8 @@ class MPU : public SimObject void registerCenteralController(CenteralController* centeral_controller); void setProcessingMode(ProcessingMode mode) { coalesceEngine->setProcessingMode(mode); } - void createAsyncPopCountDirectory(int atoms_per_block) { coalseceEngine->createAsyncPopCountDirectory(atoms_per_block); } - void createBSPPopCountDirectory(int atoms_per_block) { coalseceEngine->createBSPPopCountDirectory(atoms_per_block); } + void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } + void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } From 513e3f6beb77eb97902f9c0eafd5791b4dc9dcff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:31:55 -0800 Subject: [PATCH 227/287] Fixing typos. 
--- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 21 +++++++++++++-------- 2 files changed, 14 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c6b9cf7a52..df1abbedc3 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -114,7 +114,7 @@ CenteralController::startup() panic_if(!image.write(proxy), "%s: Unable to write image."); if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousStates::CONSUMING; + state = BulkSynchronousState::CONSUMING; } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index bfe3fe21b8..6efafbb76c 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -124,7 +124,7 @@ CoalesceEngine::createBSPPopCountDirectory(int atoms_per_block) { currentDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); - futureDirectory = new PopCountDirectroy( + futureDirectory = new PopCountDirectory( peerMemoryRange, atoms_per_block, peerMemoryAtomSize); } @@ -390,7 +390,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) bool atom_active_now = false; bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active_now |= cacheBlocks[block_inde].items[index].activeNow; + atom_active_now |= cacheBlocks[block_index].items[index].activeNow; atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { @@ -453,12 +453,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) WorkListItem items[numElementsPerLine]; pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_now = false; + bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - atom_active |= 
items[index].activeNow; + atom_active_now |= items[index].activeNow; + atom_active_future |= items[index].activeFuture; } if (atom_active_now) { // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + if (atom_active_future) { + int count_2 = futureDirectory->deactivate(addr); + } activeBuffer.emplace_back(pkt, curTick()); // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); @@ -573,7 +578,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; if (mode == ProcessingMode::ASYNCHRONOUS) { - cacheBlocks[block_index].activeNow |= active; + cacheBlocks[block_index].items[wl_offset].activeNow |= active; if (active && (!currentActiveCacheBlocks.find(block_index))) { currentActiveCacheBlocks.push_back(block_index); if (!owner->running()) { @@ -582,7 +587,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } } if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - cacheBlocks[block_index].activeFuture |= active; + cacheBlocks[block_index].items[wl_offset].activeFuture |= active; if (active && (!futureActiveCacheBlocks.find(block_index))) { futureActiveCacheBlocks.push_back(block_index); } @@ -903,7 +908,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) WorkListItem items[numElementsPerLine]; wb_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); bool atom_active_future = false; - for (int index = 0; index < numElementPerLine; index++) { + for (int index = 0; index < numElementsPerLine; index++) { atom_active_future |= items[index].activeFuture; } if (atom_active_future) { @@ -967,7 +972,7 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return activeCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); + return currentActiveCacheBlocks.size() + 
currentDirectory->workCount() + activeBuffer.size(); } void @@ -1031,7 +1036,7 @@ CoalesceEngine::processNextApplyEvent() } delete pkt; } - } else if (!activeCacheBlocks.empty()) { + } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; int initial_fifo_length = activeCacheBlocks.size(); while (true) { From d9ae6bed35e40240d7f6c80eb4c37b816099885d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:41:42 -0800 Subject: [PATCH 228/287] Fixing typos. --- src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/coalesce_engine.cc | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index df1abbedc3..db0f7941ed 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -68,7 +68,7 @@ void CenteralController::createPopCountDirectory(int atoms_per_block) { fatal_if(mode == ProcessingMode::NOT_SET, "You should set the processing " - "mode by calling either setAsyncMode or setBSPMode.") + "mode by calling either setAsyncMode or setBSPMode."); if (mode == ProcessingMode::ASYNCHRONOUS) { for (auto mpu: mpuVector) { mpu->createAsyncPopCountDirectory(atoms_per_block); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6efafbb76c..e3c194566a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,9 +1038,9 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = activeCacheBlocks.size(); + int initial_fifo_length = crrentActiveCacheBlocks.size(); while (true) { - int block_index = activeCacheBlocks.front(); + int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 
0); index++) { if (cacheBlocks[block_index].items[index].activeNow) { From 37ec3ddacd9e25127f5ee90a7341956549bc731d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 13:54:47 -0800 Subject: [PATCH 229/287] Debug. --- src/accl/graph/base/graph_workload.cc | 74 +++++++++++++++++++++- src/accl/graph/base/graph_workload.hh | 36 +++++------ src/accl/graph/sega/centeral_controller.cc | 10 +-- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/coalesce_engine.cc | 2 +- 5 files changed, 97 insertions(+), 27 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index e36c074da9..a78b3c1526 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -92,9 +92,9 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) } bool -BFSWorkload::activeCondition(WorkListItem wl) +BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (wl.tempProp < wl.prop) && (wl.degree > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); } uint32_t @@ -298,6 +298,76 @@ PRWorkload::printWorkListItem(const WorkListItem wl) "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", temp_float, prop_float, wl.degree, wl.edgeIndex); } +// void +// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); +// bool atom_active = false; +// for (int i = 0; i < num_elements; i++) { +// items[i].tempProp = readFromFloat(0); +// items[i].prop = readFromFloat(1 - alpha); +// atom_active |= activeCondition(items[i]); +// } +// if (atom_active) { +// dir->activate(pkt->getAddr()); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } + +// uint32_t +// PRWorkload::reduce(uint32_t 
update, uint32_t value) +// { +// float update_float = writeToFloat(update); +// float value_float = writeToFloat(value); +// return readFromFloat(update_float + value_float); +// } + +// uint32_t +// PRWorkload::propagate(uint32_t value, uint32_t weight) +// { +// float value_float = writeToFloat(value); +// float weight_float = writeToFloat(weight); +// if (weight == 0) { +// weight_float = 1.0; +// } +// return readFromFloat(alpha * value_float * weight_float); +// } + +// bool +// PRWorkload::activeCondition(WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float dist = std::abs(temp_float - prop_float); +// return (dist >= threshold) && (wl.degree > 0); +// } + +// uint32_t +// PRWorkload::apply(WorkListItem& wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// float delta = (temp_float - prop_float) / wl.degree; +// uint32_t delta_uint = readFromFloat(delta); +// wl.prop = wl.tempProp; +// return delta_uint; +// } + +// std::string +// PRWorkload::printWorkListItem(const WorkListItem wl) +// { +// float temp_float = writeToFloat(wl.tempProp); +// float prop_float = writeToFloat(wl.prop); +// return csprintf( +// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", +// temp_float, prop_float, wl.degree, wl.edgeIndex); +// } void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 14a6561ae3..8e27d16bf9 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -72,7 +72,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem 
old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +119,26 @@ class SSSPWorkload : public GraphWorkload }; -class PRWorkload : public GraphWorkload -{ - private: - float alpha; - float threshold; +// class PRWorkload : public GraphWorkload +// { +// private: +// float alpha; +// float threshold; - public: - PRWorkload(float alpha, float threshold): - alpha(alpha), threshold(threshold) - {} +// public: +// PRWorkload(float alpha, float threshold): +// alpha(alpha), threshold(threshold) +// {} - ~PRWorkload() {} +// ~PRWorkload() {} - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index db0f7941ed..7de6f61b56 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -58,11 +58,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -void -CenteralController::createPRWorkload(float alpha, float threshold) -{ - workload = new PRWorkload(alpha, threshold); -} +// void +// CenteralController::createPRWorkload(float alpha, float threshold) +// { +// workload = new PRWorkload(alpha, threshold); +// } void 
CenteralController::createPopCountDirectory(int atoms_per_block) diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab0e0c0c09..b32dc38385 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -71,7 +71,7 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - void createPRWorkload(float alpha, float threshold); + // void createPRWorkload(float alpha, float threshold); void recvDoneSignal(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index e3c194566a..6b44f7395b 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1038,7 +1038,7 @@ CoalesceEngine::processNextApplyEvent() } } else if (!currentActiveCacheBlocks.empty()) { int num_visited_indices = 0; - int initial_fifo_length = crrentActiveCacheBlocks.size(); + int initial_fifo_length = currentActiveCacheBlocks.size(); while (true) { int block_index = currentActiveCacheBlocks.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { From 4abd1cd5ec0e131cd56a741395e7ffe1bcdb2dd0 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:06:44 -0800 Subject: [PATCH 230/287] Debugging. 
--- src/accl/graph/base/graph_workload.cc | 8 +++++--- src/accl/graph/sega/CenteralController.py | 2 +- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index a78b3c1526..50024965a1 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -67,12 +67,14 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem items[num_elements]; pkt->writeDataToBlock((uint8_t*) items, pkt_size); - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = initValue; + if (activeCondition(new_wl, items[index])) { dir->activate(aligned_addr); } + items[index] = new_wl; + pkt->deleteData(); pkt->allocate(); pkt->setDataFromBlock((uint8_t*) items, pkt_size); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 8b43c90102..6de9e03a1c 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,6 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - PyBindMethod("createPRWorkload"), + # PyBindMethod("createPRWorkload"), PyBindMethod("printAnswerToHostSimout") ] From 32a0f813e93accd59bd0f8d70430d9d5972d6317 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:08:17 -0800 Subject: [PATCH 231/287] Typos. 
--- src/accl/graph/base/graph_workload.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 50024965a1..9c21a3932a 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -96,7 +96,7 @@ BFSWorkload::propagate(uint32_t value, uint32_t weight) bool BFSWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree; > 0); + return (new_wl.tempProp < old_wl.tempProp) && (old_wl.degree > 0); } uint32_t From 1352e207854c3f38670358efa991967ecb0a3089 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 12 Nov 2022 14:14:41 -0800 Subject: [PATCH 232/287] Debugging. --- src/accl/graph/base/graph_workload.cc | 1 + 1 file changed, 1 insertion(+) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 9c21a3932a..8536c2bbd8 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -71,6 +71,7 @@ BFSWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = initValue; if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; dir->activate(aligned_addr); } items[index] = new_wl; From f13057c8ad23d2c91203cf2ac151ce3cd54f4169 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 00:58:54 -0800 Subject: [PATCH 233/287] Finalizing bsp and pr. 
--- configs/accl/bfs.py | 3 +- configs/accl/pr.py | 28 +++-- configs/accl/sega.py | 22 ++-- configs/accl/sega_simple.py | 21 ++-- src/accl/graph/base/graph_workload.cc | 131 ++++++--------------- src/accl/graph/base/graph_workload.hh | 34 +++--- src/accl/graph/sega/CenteralController.py | 3 +- src/accl/graph/sega/centeral_controller.cc | 46 ++++---- src/accl/graph/sega/centeral_controller.hh | 4 +- src/accl/graph/sega/coalesce_engine.cc | 63 ++++++++++ src/accl/graph/sega/coalesce_engine.hh | 2 + src/accl/graph/sega/enums.cc | 7 -- src/accl/graph/sega/enums.hh | 9 -- src/accl/graph/sega/mpu.hh | 2 + 14 files changed, 193 insertions(+), 182 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 806aa8a915..ab5de485b1 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -88,7 +88,7 @@ def get_inputs(): sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA else: @@ -98,6 +98,7 @@ def get_inputs(): m5.instantiate() + system.set_async_mode() system.create_pop_count_directory(64) system.create_bfs_workload(init_addr, init_value) if sample: diff --git a/configs/accl/pr.py b/configs/accl/pr.py index e3d7c764ad..ea8a103640 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -35,9 +35,9 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) - argparser.add_argument("threshold", type=float) argparser.add_argument( "--simple", dest="simple", @@ -69,8 +69,8 @@ def get_inputs(): args.num_gpts, args.cache_size, args.graph, + args.iterations, args.alpha, - args.threshold, args.simple, args.sample, args.verify, @@ -82,13 +82,13 @@ def get_inputs(): num_gpts, cache_size, graph, + iterations, alpha, - threshold, simple, sample, verify, ) = get_inputs() - + if simple: from sega_simple import SEGA 
else: @@ -98,8 +98,9 @@ def get_inputs(): m5.instantiate() + system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha, threshold) + system.create_pr_workload(alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -112,11 +113,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") if verify: system.print_answer() - diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 1ea36ea49e..07e1b36d9d 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -73,8 +73,8 @@ def __init__( ) self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + dram= + DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port self.push_engine.mem_port = self.edge_mem_ctrl.port @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + gpt = GPT("16GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) @@ -139,15 +139,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + 
self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index f59fa71a79..8727a4c90d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -66,9 +66,9 @@ def __init__( max_propagates_per_cycle=8, update_queue_size=32, ) - + self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - + self.edge_mem_ctrl = MemCtrl( dram=DDR4_2400_8x8( range=AddrRange(edge_memory_size), in_addr_map=False) @@ -129,16 +129,23 @@ def __init__(self, num_mpus, cache_size, graph_path): self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + def create_pop_count_directory(self, atoms_per_block): - for gpt in self.gpts: - gpt.coalesce_engine.createPopCountDirectory(atoms_per_block) + self.ctrl.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) - def create_pr_workload(self, alpha, threshold): - self.ctrl.createPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) def print_answer(self): self.ctrl.printAnswerToHostSimout() - diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 8536c2bbd8..1fa2b287c4 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -111,9 +111,11 @@ std::string BFSWorkload::printWorkListItem(const WorkListItem wl) { return 
csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); + "WorkListItem{tempProp: %u, prop: %u, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + wl.tempProp, wl.prop, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? "true" : "false"); } void @@ -232,7 +234,7 @@ SSSPWorkload::printWorkListItem(const WorkListItem wl) void -PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { size_t pkt_size = pkt->getSize(); int num_elements = (int) (pkt_size / sizeof(WorkListItem)); @@ -241,9 +243,12 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) pkt->writeDataToBlock((uint8_t*) items, pkt_size); bool atom_active = false; for (int i = 0; i < num_elements; i++) { - items[i].tempProp = readFromFloat(0); - items[i].prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(items[i]); + WorkListItem new_wl = items[i]; + new_wl.tempProp = readFromFloat(1 - alpha); + new_wl.prop = readFromFloat(1); + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; + items[i] = new_wl; } if (atom_active) { dir->activate(pkt->getAddr()); @@ -254,7 +259,7 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) } uint32_t -PRWorkload::reduce(uint32_t update, uint32_t value) +BSPPRWorkload::reduce(uint32_t update, uint32_t value) { float update_float = writeToFloat(update); float value_float = writeToFloat(value); @@ -262,115 +267,47 @@ PRWorkload::reduce(uint32_t update, uint32_t value) } uint32_t -PRWorkload::propagate(uint32_t value, uint32_t weight) +BSPPRWorkload::propagate(uint32_t value, uint32_t weight) { float value_float = writeToFloat(value); - float weight_float = writeToFloat(weight); - if (weight == 0) { - weight_float = 1.0; - } - return readFromFloat(alpha * value_float * weight_float); + return readFromFloat(alpha * value_float); } bool 
-PRWorkload::activeCondition(WorkListItem wl) +BSPPRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { - float temp_float = writeToFloat(wl.tempProp); - float prop_float = writeToFloat(wl.prop); - float dist = std::abs(temp_float - prop_float); - return (dist >= threshold) && (wl.degree > 0); + return (old_wl.degree > 0); } uint32_t -PRWorkload::apply(WorkListItem& wl) +BSPPRWorkload::apply(WorkListItem& wl) { - float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); - float delta = (temp_float - prop_float) / wl.degree; + float delta = prop_float / wl.degree; uint32_t delta_uint = readFromFloat(delta); - wl.prop = wl.tempProp; return delta_uint; } +void +BSPPRWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; + wl.tempProp = readFromFloat(1 - alpha); + wl.activeFuture = (wl.degree > 0); +} + std::string -PRWorkload::printWorkListItem(const WorkListItem wl) +BSPPRWorkload::printWorkListItem(const WorkListItem wl) { float temp_float = writeToFloat(wl.tempProp); float prop_float = writeToFloat(wl.prop); return csprintf( - "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", - temp_float, prop_float, wl.degree, wl.edgeIndex); -} -// void -// PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); -// bool atom_active = false; -// for (int i = 0; i < num_elements; i++) { -// items[i].tempProp = readFromFloat(0); -// items[i].prop = readFromFloat(1 - alpha); -// atom_active |= activeCondition(items[i]); -// } -// if (atom_active) { -// dir->activate(pkt->getAddr()); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } - -// uint32_t -// PRWorkload::reduce(uint32_t update, uint32_t value) -// { -// float update_float = 
writeToFloat(update); -// float value_float = writeToFloat(value); -// return readFromFloat(update_float + value_float); -// } - -// uint32_t -// PRWorkload::propagate(uint32_t value, uint32_t weight) -// { -// float value_float = writeToFloat(value); -// float weight_float = writeToFloat(weight); -// if (weight == 0) { -// weight_float = 1.0; -// } -// return readFromFloat(alpha * value_float * weight_float); -// } - -// bool -// PRWorkload::activeCondition(WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float dist = std::abs(temp_float - prop_float); -// return (dist >= threshold) && (wl.degree > 0); -// } - -// uint32_t -// PRWorkload::apply(WorkListItem& wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// float delta = (temp_float - prop_float) / wl.degree; -// uint32_t delta_uint = readFromFloat(delta); -// wl.prop = wl.tempProp; -// return delta_uint; -// } - -// std::string -// PRWorkload::printWorkListItem(const WorkListItem wl) -// { -// float temp_float = writeToFloat(wl.tempProp); -// float prop_float = writeToFloat(wl.prop); -// return csprintf( -// "WorkListItem{tempProp: %f, prop: %f, degree: %u, edgeIndex: %u}", -// temp_float, prop_float, wl.degree, wl.edgeIndex); -// } + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} void CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 8e27d16bf9..fdd4928e10 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; }; @@ -72,6 +73,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; @@ -119,26 +121,24 @@ class SSSPWorkload : public GraphWorkload }; -// class PRWorkload : public GraphWorkload -// { -// private: -// float alpha; -// float threshold; +class BSPPRWorkload : public GraphWorkload +{ + private: + float alpha; -// public: -// PRWorkload(float alpha, float threshold): -// alpha(alpha), threshold(threshold) -// {} + public: + BSPPRWorkload(float alpha): alpha(alpha) {} -// ~PRWorkload() {} + ~BSPPRWorkload() {} -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t 
reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; class CCWorkload : public GraphWorkload { diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 6de9e03a1c..9dd8f41e61 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,7 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), - # PyBindMethod("createPRWorkload"), + PyBindMethod("createPRWorkload"), + PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7de6f61b56..0103b1a0c4 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -43,8 +43,7 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), - mode(ProcessingMode::NOT_SET), - state(BulkSynchronousState::DONT_CARE) + mode(ProcessingMode::NOT_SET) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -58,11 +57,11 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } -// void -// CenteralController::createPRWorkload(float alpha, float threshold) -// { -// workload = new PRWorkload(alpha, threshold); -// } +void +CenteralController::createPRWorkload(float alpha) +{ + workload = new BSPPRWorkload(alpha); +} void CenteralController::createPopCountDirectory(int atoms_per_block) @@ -113,9 +112,6 @@ CenteralController::startup() 
panic_if(!image.write(proxy), "%s: Unable to write image."); - if (mode == ProcessingMode::BULK_SYNCHRONOUS) { - state = BulkSynchronousState::CONSUMING; - } for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -152,20 +148,25 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::BULK_SYNCHRONOUS) { - assert(state != BulkSynchronousState::DONT_CARE); - if (state == BulkSynchronousState::APPLYING) { - // TODO: - // 1- Toggle directories - // 2- Check if termination condition is met - // 3- If yes, schedule exit event, - // 4- If not switch state to consuming. - exitSimLoopNow("applying done."); - } else if (state == BulkSynchronousState::CONSUMING) { - // TODO: - // Schedule Bulk apply - exitSimLoopNow("consuming done."); + for (auto mpu: mpuVector) { + mpu->postConsumeProcess(); + mpu->swapDirectories(); + if (!mpu->running() && (mpu->workCount() > 0)) { + mpu->start(); + } } + exitSimLoopNow("finished an iteration."); + } +} + +int +CenteralController::workCount() +{ + int work_count = 0; + for (auto mpu: mpuVector) { + work_count += mpu->workCount(); } + return work_count; } void @@ -184,7 +185,6 @@ CenteralController::printAnswerToHostSimout() } pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); for (int i = 0; i < num_items; i++) { - workload->apply(items[i]); std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index b32dc38385..ab039e5024 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -50,7 +50,6 @@ class CenteralController : public ClockedObject Addr maxVertexAddr; ProcessingMode mode; - BulkSynchronousState state; std::vector mpuVector; std::unordered_map addrRangeListMap; @@ -71,10 +70,11 @@ class CenteralController : public ClockedObject void 
createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); - // void createPRWorkload(float alpha, float threshold); + void createPRWorkload(float alpha); void recvDoneSignal(); + int workCount(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 6b44f7395b..32b946d29f 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -111,6 +111,69 @@ CoalesceEngine::postMemInitSetup() currentDirectory->setLastAtomAddr(lastAtomAddr); } +void +CoalesceEngine::postConsumeProcess() +{ + WorkListItem items[numElementsPerLine]; + for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + int block_index = getBlockIndex(addr); + if (cacheBlocks[block_index].addr == addr) { + assert(cacheBlocks[block_index].valid); + assert(!cacheBlocks[block_index].hasConflict); + assert(cacheBlocks[block_index].state == CacheState::IDLE); + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); + // if (cacheBlocks[block_index].items[index].activeFuture) { + // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + // cacheBlocks[block_index].items[index].activeNow = true; + // cacheBlocks[block_index].items[index].activeFuture = false; + // } + atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; + graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); + atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; + if (cacheBlocks[block_index].items[index].activeFuture) { + cacheBlocks[block_index].items[index].activeFuture = false; + cacheBlocks[block_index].items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + 
futureActiveCacheBlocks.push_back(block_index); + } + if (atom_active_future_before && !atom_active_future_after) { + futureActiveCacheBlocks.erase(block_index); + } + } else { + PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); + memPort.sendFunctional(read_pkt); + read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); + delete read_pkt; + bool atom_active_future_before = false; + bool atom_active_future_after = false; + for (int index = 0; index < numElementsPerLine; index++) { + assert(!items[index].activeNow); + atom_active_future_before |= items[index].activeFuture; + graphWorkload->interIterationInit(items[index]); + atom_active_future_after |= items[index].activeFuture; + if (items[index].activeFuture) { + items[index].activeFuture = false; + items[index].activeNow = true; + } + } + if (!atom_active_future_before && atom_active_future_after) { + futureDirectory->activate(addr); + } + if (atom_active_future_before && !atom_active_future_after) { + futureDirectory->deactivate(addr); + } + PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); + memPort.sendFunctional(write_pkt); + delete write_pkt; + } + } +} + void CoalesceEngine::createAsyncPopCountDirectory(int atoms_per_block) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 39f2491232..c9d8e47f15 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -205,12 +205,14 @@ class CoalesceEngine : public BaseMemoryEngine virtual void recvFunctional(PacketPtr pkt); void postMemInitSetup(); + void postConsumeProcess(); void swapDirectories(); ReadReturnStatus recvWLRead(Addr addr); void recvWLWrite(Addr addr, WorkListItem wl); int workCount(); + int futureWorkCount(); void recvVertexPull(); bool done(); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 099594e9eb..f7ef96197f 100644 --- a/src/accl/graph/sega/enums.cc +++ 
b/src/accl/graph/sega/enums.cc @@ -59,11 +59,4 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = "BULK_SYNCHRONOUS" }; -const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE] = -{ - "DONT_CARE", - "CONSUMING", - "APPLYING" -}; - } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 969ee8a976..f97c33a0e0 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -69,15 +69,6 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum BulkSynchronousState -{ - DONT_CARE, - CONSUMING, - APPLYING, - NUM_BULK_SYNCHRONOUS_STATE, -}; -extern const char* bulkSynchronousStateStrings[NUM_BULK_SYNCHRONOUS_STATE]; - } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 7d75e3e0b7..04393db36d 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -70,6 +70,8 @@ class MPU : public SimObject AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } + void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } + void swapDirectories() { coalesceEngine->swapDirectories(); } bool handleIncomingUpdate(PacketPtr pkt); From f59afb8fb699e6ae63af78d6e4dfc165696c319f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 11:17:39 -0800 Subject: [PATCH 234/287] Fixing a bug in async mode. 
--- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/CenteralController.py | 3 ++- src/accl/graph/sega/centeral_controller.cc | 10 +++++----- src/accl/graph/sega/coalesce_engine.cc | 6 +++--- 5 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 07e1b36d9d..b5ce618f7f 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -124,7 +124,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("16GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range( [vertex_ranges[i], vertex_ranges[i + num_mpus]] ) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 8727a4c90d..ff97134b47 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -117,7 +117,7 @@ def __init__(self, num_mpus, cache_size, graph_path): gpts = [] for i in range(num_mpus): - gpt = GPT("4GiB", cache_size) + gpt = GPT("2GiB", cache_size) gpt.set_vertex_range(vertex_ranges[i]) gpt.set_edge_image(f"{graph_path}/edgelist_{i}") gpts.append(gpt) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 9dd8f41e61..f9544ec539 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -37,7 +37,8 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") - image_file = Param.String("Path to the vertex image file.") + vertex_image_file = Param.String("Path to the vertex image file.") + edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0103b1a0c4..c44789f9f0 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -89,7 +89,7 @@ 
CenteralController::startup() mpu->recvWorkload(workload); } - const auto& file = params().image_file; + const auto& vertex_file = params().vertex_image_file; if (file == "") return; @@ -97,10 +97,10 @@ CenteralController::startup() fatal_if(!object, "%s: Could not load %s.", name(), file); loader::debugSymbolTable.insert(*object->symtab().globals()); - loader::MemoryImage image = object->buildImage(); - maxVertexAddr = image.maxAddr(); + loader::MemoryImage vertex_image = object->buildImage(); + maxVertexAddr = vertex_image.maxAddr(); - PortProxy proxy( + PortProxy vertex_proxy( [this](PacketPtr pkt) { for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; @@ -110,7 +110,7 @@ CenteralController::startup() } }, system->cacheLineSize()); - panic_if(!image.write(proxy), "%s: Unable to write image."); + panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 32b946d29f..35b2bf71cf 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -744,8 +744,6 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - // FIXME: done() might have a different meaning depending on - // ProcessingMode and Processing state if (done()) { owner->recvDoneSignal(); } @@ -934,7 +932,9 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + if (atom_active_future) { + int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + } // stats.blockActiveCount.sample(count); // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); From 772795067298f974d713a6a605b0056e30bfe537 Mon 
Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 16:03:25 -0800 Subject: [PATCH 235/287] Debugging and removing typos. sega-ddr represents correct system config. --- configs/accl/sega-ddr/bfs.py | 125 +++++++++++++ configs/accl/sega-ddr/pr.py | 128 +++++++++++++ configs/accl/sega-ddr/sega.py | 200 +++++++++++++++++++++ src/accl/graph/sega/CenteralController.py | 1 - src/accl/graph/sega/centeral_controller.cc | 6 +- src/accl/graph/sega/coalesce_engine.cc | 68 ++++--- src/accl/graph/sega/coalesce_engine.hh | 9 +- 7 files changed, 505 insertions(+), 32 deletions(-) create mode 100644 configs/accl/sega-ddr/bfs.py create mode 100644 configs/accl/sega-ddr/pr.py create mode 100644 configs/accl/sega-ddr/sega.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py new file mode 100644 index 0000000000..8766822b33 --- /dev/null +++ b/configs/accl/sega-ddr/bfs.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + 
system.set_async_mode() + system.create_pop_count_directory(64) + system.create_bfs_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/sega-ddr/pr.py new file mode 100644 index 0000000000..ea8a103640 --- /dev/null +++ b/configs/accl/sega-ddr/pr.py @@ -0,0 +1,128 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("iterations", type=int) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.cache_size, + args.graph, + args.iterations, + args.alpha, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + cache_size, + graph, + iterations, + alpha, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_bsp_mode() + system.create_pop_count_directory(64) + system.create_pr_workload(alpha) + if sample: + while True: + 
exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + iteration = 0 + while iteration < iterations: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + iteration += 1 + if system.work_count() == 0: + break + print(f"#iterations: {iteration}") + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py new file mode 100644 index 0000000000..c5545ee0f1 --- /dev/null +++ b/configs/accl/sega-ddr/sega.py @@ -0,0 +1,200 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret, intlv_low_bit + intlv_bits - 1 + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=32, + active_buffer_size=64, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = HBMCtrl( + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + 
push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_ranges): + self.vertex_mem_ctrl.dram.range = vertex_ranges[0] + self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] + + def set_vertex_pch_bit(self, pch_bit): + self.vertex_mem_ctrl.pch_bit = pch_bit + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=8, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges, pch_bit = interleave_addresses( + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 + ) + gpts = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) + gpt.set_vertex_pch_bit(pch_bit) + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index f9544ec539..bda2fa3d6a 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -38,7 +38,6 @@ class CenteralController(ClockedObject): system = Param.System(Parent.any, "System this Engine is a part of") vertex_image_file = Param.String("Path to the vertex image file.") - edgelist_image_file = Param.String("Path to the edgelist image file.") mpu_vector = VectorParam.MPU("All mpus in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index c44789f9f0..26e4473b03 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ 
b/src/accl/graph/sega/centeral_controller.cc @@ -90,11 +90,11 @@ CenteralController::startup() } const auto& vertex_file = params().vertex_image_file; - if (file == "") + if (vertex_file == "") return; - auto* object = loader::createObjectFile(file, true); - fatal_if(!object, "%s: Could not load %s.", name(), file); + auto* object = loader::createObjectFile(vertex_file, true); + fatal_if(!object, "%s: Could not load %s.", name(), vertex_file); loader::debugSymbolTable.insert(*object->symtab().globals()); loader::MemoryImage vertex_image = object->buildImage(); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 35b2bf71cf..263e08d901 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -343,12 +343,14 @@ CoalesceEngine::recvWLRead(Addr addr) if (atom_active_now) { currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -457,14 +459,16 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); currentActiveCacheBlocks.push_back(block_index); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = futureDirectory->deactivate(addr); futureActiveCacheBlocks.push_back(block_index); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } assert(MSHR.find(block_index) != MSHR.end()); @@ -522,15 +526,17 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) atom_active_future |= items[index].activeFuture; } if (atom_active_now) { - // TODO: Add sampling of blockActiveCount and frontierSize here int count = currentDirectory->deactivate(addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->deactivate(addr); + int count = futureDirectory->deactivate(addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } activeBuffer.emplace_back(pkt, curTick()); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); } else { + stats.wastefulBytesRead += pkt->getSize(); delete pkt; } @@ -686,15 +692,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - // TODO: Sample frontier size and blockCount here. 
currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } cacheBlocks[block_index].reset(); } @@ -932,17 +939,21 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); + stats.currentFrontierSize.sample(currentDirectory->workCount()); + stats.currentBlockActiveCount.sample(count); if (atom_active_future) { - int count_2 = futureDirectory->activate(cacheBlocks[block_index].addr); + int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } - // stats.blockActiveCount.sample(count); - // stats.frontierSize.sample(directory->workCount()); memPort.sendPacket(pkt); onTheFlyReqs++; } } else { if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); + stats.futureFrontierSize.sample(futureDirectory->workCount()); + stats.futureBlockActiveCount.sample(count); } memPort.sendPacket(pkt); onTheFlyReqs++; @@ -956,7 +967,6 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "the current write back scheduled at tick %lu for " "the right function scheduled later.\n", __func__, block_index, schedule_tick); - stats.numInvalidWriteBacks++; } } @@ -1141,8 +1151,8 @@ CoalesceEngine::processNextApplyEvent() } } } else { - 
DPRINTF(CoalesceEngine, "%s: Could not find " - "work to apply.\n", __func__); + DPRINTF(CoalesceEngine, "%s: Could not find work to apply.\n", __func__); + stats.worklessCycles++; } if (pullCondition()) { @@ -1184,6 +1194,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "delayed because of port shortage. "), ADD_STAT(numMemoryBlocks, statistics::units::Count::get(), "Number of times memory bandwidth was not available."), + ADD_STAT(wastefulBytesRead, statistics::units::Byte::get(), + "Number of bytes read that were not used by coalesce engine"), ADD_STAT(verticesPulled, statistics::units::Count::get(), "Number of times a pull request has been sent by PushEngine."), ADD_STAT(verticesPushed, statistics::units::Count::get(), @@ -1192,8 +1204,8 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Time of the last pull request. (Relative to reset_stats)"), ADD_STAT(lastVertexPushTime, statistics::units::Tick::get(), "Time of the last vertex push. 
(Relative to reset_stats)"), - ADD_STAT(numInvalidWriteBacks, statistics::units::Count::get(), - "Number of times a scheduled memory function has been invalid."), + ADD_STAT(worklessCycles, statistics::units::Count::get(), + "cycles the coalesce engine could not find work for apply"), ADD_STAT(hitRate, statistics::units::Ratio::get(), "Hit rate in the cache."), ADD_STAT(vertexPullBW, statistics::units::Rate::get(), "Rate at which vertices are pushed."), - ADD_STAT(frontierSize, statistics::units::Count::get(), - "Histogram of the length of the bitvector."), - ADD_STAT(blockActiveCount, statistics::units::Count::get(), - "Histogram of the popCount values in the directory"), + ADD_STAT(currentFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the current bitvector."), + ADD_STAT(futureFrontierSize, statistics::units::Count::get(), + "Histogram of the length of the future bitvector."), + ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the current directory"), + ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + "Histogram of the popCount values in the future directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), @@ -1225,8 +1241,10 @@ CoalesceEngine::CoalesceStats::regStats() vertexPushBW = (verticesPushed * getClockFrequency()) / lastVertexPushTime; - frontierSize.init(64); - blockActiveCount.init(64); + currentFrontierSize.init(64); + futureFrontierSize.init(64); + currentBlockActiveCount.init(64); + futureBlockActiveCount.init(64); responseQueueLatency.init(64); memoryFunctionLatency.init(64); } diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index c9d8e47f15..8ee17781fc 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -171,18 +171,21 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Scalar numConflicts; statistics::Scalar responsePortShortage; statistics::Scalar numMemoryBlocks; + statistics::Scalar wastefulBytesRead; statistics::Scalar verticesPulled; statistics::Scalar verticesPushed; statistics::Scalar lastVertexPullTime; statistics::Scalar lastVertexPushTime; - statistics::Scalar numInvalidWriteBacks; + statistics::Scalar worklessCycles; statistics::Formula hitRate; statistics::Formula vertexPullBW; statistics::Formula vertexPushBW; - statistics::Histogram frontierSize; - statistics::Histogram blockActiveCount; + statistics::Histogram currentFrontierSize; + statistics::Histogram futureFrontierSize; + statistics::Histogram currentBlockActiveCount; + statistics::Histogram futureBlockActiveCount; statistics::Histogram responseQueueLatency; statistics::Histogram memoryFunctionLatency; }; From 93624ccbddc96f8a561c97a4864f6894d708d528 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sun, 13 Nov 2022 22:51:41 -0800 Subject: [PATCH 236/287] Debugging, finalizing the config and merging new workloads. 
--- configs/accl/sega-ddr/bfs.py | 15 +- configs/accl/sega-ddr/cc.py | 119 +++++++++++ configs/accl/sega-ddr/sega.py | 15 +- configs/accl/sega-ddr/sssp.py | 125 +++++++++++ src/accl/graph/base/graph_workload.cc | 233 +++++++-------------- src/accl/graph/base/graph_workload.hh | 81 ++++--- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/centeral_controller.cc | 18 ++ src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/push_engine.cc | 1 - 10 files changed, 408 insertions(+), 205 deletions(-) create mode 100644 configs/accl/sega-ddr/cc.py create mode 100644 configs/accl/sega-ddr/sssp.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py index 8766822b33..97f1b5dc21 100644 --- a/configs/accl/sega-ddr/bfs.py +++ b/configs/accl/sega-ddr/bfs.py @@ -39,6 +39,14 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +81,7 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -87,6 +96,7 @@ def get_inputs(): graph, init_addr, init_value, + visited, simple, sample, verify, @@ -103,7 +113,10 @@ def get_inputs(): system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/sega-ddr/cc.py new file mode 100644 index 0000000000..9b6d2b587d --- /dev/null +++ b/configs/accl/sega-ddr/cc.py @@ -0,0 +1,119 @@ +# Copyright (c) 2022 The Regents of the 
University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_cc_workload() + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py index c5545ee0f1..8325cf7565 100644 --- a/configs/accl/sega-ddr/sega.py +++ 
b/configs/accl/sega-ddr/sega.py @@ -56,8 +56,8 @@ def __init__(self, register_file_size: int, cache_size: str): attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -121,7 +121,7 @@ def __init__(self, size: str): dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) ) self.xbar = NoncoherentXBar( - width=8, frontend_latency=1, forward_latency=1, response_latency=1 + width=64, frontend_latency=1, forward_latency=1, response_latency=1 ) self.xbar.mem_side_ports = self.mem_ctrl.port @@ -193,6 +193,15 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sega-ddr/sssp.py new file mode 100644 index 0000000000..f2e60b856a --- /dev/null +++ b/configs/accl/sega-ddr/sssp.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.init_addr, + args.init_value, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + init_addr, + init_value, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + system.create_pop_count_directory(64) + system.create_sssp_workload(init_addr, init_value) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + 
system.print_answer() diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 1fa2b287c4..7471e4d073 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,90 +118,95 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -void -BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} +// void +// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) +// { +// size_t pkt_size = pkt->getSize(); +// uint64_t aligned_addr = roundDown(initAddr, pkt_size); + +// if (pkt->getAddr() == aligned_addr) { +// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); +// WorkListItem items[num_elements]; + +// pkt->writeDataToBlock((uint8_t*) items, pkt_size); + +// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); +// items[index].tempProp = initValue; +// if (activeCondition(items[index])) { +// dir->activate(aligned_addr); +// } +// pkt->deleteData(); +// pkt->allocate(); +// pkt->setDataFromBlock((uint8_t*) items, pkt_size); +// } +// } + +// uint32_t +// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) +// { +// return std::min(update, value); +// } + +// uint32_t +// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) +// { +// return 1; +// } + +// bool +// BFSVisitedWorkload::activeCondition(WorkListItem wl) +// { +// return 
(wl.tempProp < wl.prop) && (wl.degree > 0); +// } + +// uint32_t +// BFSVisitedWorkload::apply(WorkListItem& wl) +// { +// wl.prop = wl.tempProp; +// return wl.prop; +// } + +// std::string +// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) +// { +// return csprintf( +// "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", +// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex +// ); +// } uint32_t -BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -{ - return 1; -} - -bool -BFSVisitedWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -BFSVisitedWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); +BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { + return value; } void -SSSPWorkload::init(PacketPtr pkt, WorkDirectory* dir) +CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) { + Addr pkt_addr = pkt->getAddr(); size_t pkt_size = pkt->getSize(); - uint64_t aligned_addr = roundDown(initAddr, pkt_size); - - if (pkt->getAddr() == aligned_addr) { - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int num_elements = (int) (pkt_size / sizeof(WorkListItem)); + WorkListItem items[num_elements]; - int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); - items[index].tempProp = initValue; - if (activeCondition(items[index])) { - dir->activate(aligned_addr); + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + bool atom_active = false; + for (int i = 0; i < num_elements; i++) { + 
WorkListItem new_wl = items[i]; + new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; + bool vertex_active = activeCondition(new_wl, items[i]); + if (vertex_active) { + new_wl.activeNow = true; } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); - } -} + items[i] = new_wl; + atom_active |= vertex_active; -uint32_t -SSSPWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); } uint32_t @@ -210,29 +215,6 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } -bool -SSSPWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -SSSPWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -SSSPWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - - void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -309,61 +291,4 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? 
"true" : "false"); } -void -CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) -{ - Addr pkt_addr = pkt->getAddr(); - size_t pkt_size = pkt->getSize(); - int num_elements = (int) (pkt_size / sizeof(WorkListItem)); - WorkListItem items[num_elements]; - - pkt->writeDataToBlock((uint8_t*) items, pkt_size); - bool atom_active = false; - for (int i = 0; i < num_elements; i++) { - items[i].tempProp = (int) ( pkt_addr / sizeof(WorkListItem)) + i; - items[i].prop = -1; - atom_active |= activeCondition(items[i]); - } - if (atom_active) { - dir->activate(pkt->getAddr()); - } - pkt->deleteData(); - pkt->allocate(); - pkt->setDataFromBlock((uint8_t*) items, pkt_size); -} - -uint32_t -CCWorkload::reduce(uint32_t update, uint32_t value) -{ - return std::min(update, value); -} - -uint32_t -CCWorkload::propagate(uint32_t value, uint32_t weight) -{ - return value; -} - -bool -CCWorkload::activeCondition(WorkListItem wl) -{ - return (wl.tempProp < wl.prop) && (wl.degree > 0); -} - -uint32_t -CCWorkload::apply(WorkListItem& wl) -{ - wl.prop = wl.tempProp; - return wl.prop; -} - -std::string -CCWorkload::printWorkListItem(const WorkListItem wl) -{ - return csprintf( - "WorkListItem{tempProp: %u, prop: %u, degree: %u, edgeIndex: %u}", - wl.tempProp, wl.prop, wl.degree, wl.edgeIndex - ); -} - } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fdd4928e10..fa722a634e 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -78,49 +78,31 @@ class BFSWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class BFSVisitedWorkload : public GraphWorkload +class BFSVisitedWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - BFSVisitedWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + BFSVisitedWorkload(Addr init_addr, uint32_t init_value): + 
BFSWorkload(init_addr, init_value) {} + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; +}; - ~BFSVisitedWorkload() {} - +class CCWorkload : public BFSVisitedWorkload +{ + public: + CCWorkload(): BFSVisitedWorkload(0, 0) {} virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); }; -class SSSPWorkload : public GraphWorkload +class SSSPWorkload : public BFSWorkload { - private: - uint64_t initAddr; - uint32_t initValue; - public: - SSSPWorkload(uint64_t init_addr, uint32_t init_value): - initAddr(init_addr), initValue(init_value) + SSSPWorkload(Addr init_addr, uint32_t init_value): + BFSWorkload(init_addr, init_value) {} - - ~SSSPWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); + virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; - class BSPPRWorkload : public GraphWorkload { private: @@ -140,21 +122,28 @@ class BSPPRWorkload : public GraphWorkload virtual std::string printWorkListItem(const WorkListItem wl); }; -class CCWorkload : public GraphWorkload -{ - - public: - CCWorkload() {} - - ~CCWorkload() {} - - virtual void init(PacketPtr pkt, WorkDirectory* dir); - virtual uint32_t reduce(uint32_t update, uint32_t value); - virtual uint32_t propagate(uint32_t value, uint32_t weight); - virtual uint32_t apply(WorkListItem& wl); - virtual bool activeCondition(WorkListItem wl); - virtual std::string printWorkListItem(const WorkListItem wl); -}; +// class 
BSPBCWorkload : public GraphWorkload +// { +// private: +// int currentDepth; +// Addr initAddr; +// uint32_t initValue; + +// public: +// BSPBCWorkload(Addr init_addr, uint32_t init_value): +// currentDepth(1), initAddr(init_addr), initValue(init_value) +// {} + +// ~BSPBCWorkload() {} + +// virtual void init(PacketPtr pkt, WorkDirectory* dir); +// virtual uint32_t reduce(uint32_t update, uint32_t value); +// virtual uint32_t propagate(uint32_t value, uint32_t weight); +// virtual uint32_t apply(WorkListItem& wl); +// virtual void interIterationInit(WorkListItem& wl); +// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); +// virtual std::string printWorkListItem(const WorkListItem wl); +// }; } diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index bda2fa3d6a..f3210a8ec3 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -46,6 +46,9 @@ class CenteralController(ClockedObject): PyBindMethod("setBSPMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), + PyBindMethod("createBFSVisitedWorkload"), + PyBindMethod("createSSSPWorkload"), + PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 26e4473b03..8414aee259 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -57,6 +57,24 @@ CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) workload = new BFSWorkload(init_addr, init_value); } +void +CenteralController::createBFSVisitedWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BFSVisitedWorkload(init_addr, init_value); +} + +void +CenteralController::createSSSPWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new 
SSSPWorkload(init_addr, init_value); +} + +void +CenteralController::createCCWorkload() +{ + workload = new CCWorkload(); +} + void CenteralController::createPRWorkload(float alpha) { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ab039e5024..aa3938353d 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -70,6 +70,9 @@ class CenteralController : public ClockedObject void createPopCountDirectory(int atoms_per_block); void createBFSWorkload(Addr init_addr, uint32_t init_value); + void createBFSVisitedWorkload(Addr init_addr, uint32_t init_value); + void createSSSPWorkload(Addr init_addr, uint32_t init_value); + void createCCWorkload(); void createPRWorkload(float alpha); void recvDoneSignal(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 09f29a43e4..a8c9a1bcb1 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -411,7 +411,6 @@ PushEngine::createUpdatePacket(Addr addr, T value) // bits req->setPC(((Addr) 1) << 2); - // FIXME: MemCmd::UpdateWL PacketPtr pkt = new Packet(req, MemCmd::UpdateWL); pkt->allocate(); From aee9d09f4fbf08f7a2c6f4a81957a82546a8f0bf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:15:59 -0800 Subject: [PATCH 237/287] Fixing port proxy bug of limiting size to int. 
--- src/accl/graph/base/graph_workload.cc | 8 ++------ src/accl/graph/sega/centeral_controller.cc | 12 +++++++----- src/accl/graph/sega/mpu.hh | 1 + src/mem/port_proxy.cc | 6 +++--- src/mem/port_proxy.hh | 18 +++++++++--------- src/mem/translating_port_proxy.cc | 6 +++--- src/mem/translating_port_proxy.hh | 6 +++--- 7 files changed, 28 insertions(+), 29 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7471e4d073..38f11778b6 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -193,13 +193,9 @@ CCWorkload::init(PacketPtr pkt, WorkDirectory* dir) for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; new_wl.tempProp = (int) (pkt_addr / sizeof(WorkListItem)) + i; - bool vertex_active = activeCondition(new_wl, items[i]); - if (vertex_active) { - new_wl.activeNow = true; - } + new_wl.activeNow = activeCondition(new_wl, items[i]); + atom_active |= new_wl.activeNow; items[i] = new_wl; - atom_active |= vertex_active; - } if (atom_active) { dir->activate(pkt->getAddr()); diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 8414aee259..970a0572c5 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -101,6 +101,7 @@ CenteralController::createPopCountDirectory(int atoms_per_block) void CenteralController::startup() { + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); for (auto mpu: mpuVector) { addrRangeListMap[mpu] = mpu->getAddrRanges(); mpu->setProcessingMode(mode); @@ -126,7 +127,7 @@ CenteralController::startup() mpu->recvFunctional(pkt); } } - }, system->cacheLineSize()); + }, vertex_atom); panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); @@ -190,18 +191,19 @@ CenteralController::workCount() void CenteralController::printAnswerToHostSimout() { - int num_items = system->cacheLineSize() / 
sizeof(WorkListItem); + unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); + int num_items = vertex_atom / sizeof(WorkListItem); WorkListItem items[num_items]; - for (Addr addr = 0; addr < maxVertexAddr; addr += system->cacheLineSize()) + for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) { - PacketPtr pkt = createReadPacket(addr, system->cacheLineSize()); + PacketPtr pkt = createReadPacket(addr, vertex_atom); for (auto mpu: mpuVector) { AddrRangeList range_list = addrRangeListMap[mpu]; if (contains(range_list, addr)) { mpu->recvFunctional(pkt); } } - pkt->writeDataToBlock((uint8_t*) items, system->cacheLineSize()); + pkt->writeDataToBlock((uint8_t*) items, vertex_atom); for (int i = 0; i < num_items; i++) { std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, workload->printWorkListItem(items[i])); diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 04393db36d..95d3adeca5 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -67,6 +67,7 @@ class MPU : public SimObject void createAsyncPopCountDirectory(int atoms_per_block) { coalesceEngine->createAsyncPopCountDirectory(atoms_per_block); } void createBSPPopCountDirectory(int atoms_per_block) { coalesceEngine->createBSPPopCountDirectory(atoms_per_block); } + unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } diff --git a/src/mem/port_proxy.cc b/src/mem/port_proxy.cc index 19e1a53e84..55145ab7d7 100644 --- a/src/mem/port_proxy.cc +++ b/src/mem/port_proxy.cc @@ -56,7 +56,7 @@ PortProxy::PortProxy(const RequestPort &port, unsigned int cache_line_size) : void PortProxy::readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const + void *p, Addr size) const { for 
(ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -73,7 +73,7 @@ PortProxy::readBlobPhys(Addr addr, Request::Flags flags, void PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const + const void *p, Addr size) const { for (ChunkGenerator gen(addr, size, _cacheLineSize); !gen.done(); gen.next()) { @@ -90,7 +90,7 @@ PortProxy::writeBlobPhys(Addr addr, Request::Flags flags, void PortProxy::memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const + uint8_t v, Addr size) const { // quick and dirty... uint8_t *buf = new uint8_t[size]; diff --git a/src/mem/port_proxy.hh b/src/mem/port_proxy.hh index 29f6ba60a4..8cd21322ea 100644 --- a/src/mem/port_proxy.hh +++ b/src/mem/port_proxy.hh @@ -120,19 +120,19 @@ class PortProxy : FunctionalRequestProtocol * Read size bytes memory at physical address and store in p. */ void readBlobPhys(Addr addr, Request::Flags flags, - void *p, int size) const; + void *p, Addr size) const; /** * Write size bytes from p to physical address. */ void writeBlobPhys(Addr addr, Request::Flags flags, - const void *p, int size) const; + const void *p, Addr size) const; /** * Fill size bytes starting at physical addr with byte value val. */ void memsetBlobPhys(Addr addr, Request::Flags flags, - uint8_t v, int size) const; + uint8_t v, Addr size) const; @@ -143,7 +143,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryReadBlob(Addr addr, void *p, int size) const + tryReadBlob(Addr addr, void *p, Addr size) const { readBlobPhys(addr, 0, p, size); return true; @@ -154,7 +154,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. 
*/ virtual bool - tryWriteBlob(Addr addr, const void *p, int size) const + tryWriteBlob(Addr addr, const void *p, Addr size) const { writeBlobPhys(addr, 0, p, size); return true; @@ -165,7 +165,7 @@ class PortProxy : FunctionalRequestProtocol * Returns true on success and false on failure. */ virtual bool - tryMemsetBlob(Addr addr, uint8_t val, int size) const + tryMemsetBlob(Addr addr, uint8_t val, Addr size) const { memsetBlobPhys(addr, 0, val, size); return true; @@ -179,7 +179,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryReadBlob, but insists on success. */ void - readBlob(Addr addr, void *p, int size) const + readBlob(Addr addr, void *p, Addr size) const { if (!tryReadBlob(addr, p, size)) fatal("readBlob(%#x, ...) failed", addr); @@ -189,7 +189,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryWriteBlob, but insists on success. */ void - writeBlob(Addr addr, const void *p, int size) const + writeBlob(Addr addr, const void *p, Addr size) const { if (!tryWriteBlob(addr, p, size)) fatal("writeBlob(%#x, ...) failed", addr); @@ -199,7 +199,7 @@ class PortProxy : FunctionalRequestProtocol * Same as tryMemsetBlob, but insists on success. */ void - memsetBlob(Addr addr, uint8_t v, int size) const + memsetBlob(Addr addr, uint8_t v, Addr size) const { if (!tryMemsetBlob(addr, v, size)) fatal("memsetBlob(%#x, ...) 
failed", addr); diff --git a/src/mem/translating_port_proxy.cc b/src/mem/translating_port_proxy.cc index 8ab859f40d..bc698c1a07 100644 --- a/src/mem/translating_port_proxy.cc +++ b/src/mem/translating_port_proxy.cc @@ -86,7 +86,7 @@ TranslatingPortProxy::tryOnBlob(BaseMMU::Mode mode, TranslationGenPtr gen, } bool -TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const +TranslatingPortProxy::tryReadBlob(Addr addr, void *p, Addr size) const { constexpr auto mode = BaseMMU::Read; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -99,7 +99,7 @@ TranslatingPortProxy::tryReadBlob(Addr addr, void *p, int size) const bool TranslatingPortProxy::tryWriteBlob( - Addr addr, const void *p, int size) const + Addr addr, const void *p, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( @@ -111,7 +111,7 @@ TranslatingPortProxy::tryWriteBlob( } bool -TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, int size) const +TranslatingPortProxy::tryMemsetBlob(Addr addr, uint8_t v, Addr size) const { constexpr auto mode = BaseMMU::Write; return tryOnBlob(mode, _tc->getMMUPtr()->translateFunctional( diff --git a/src/mem/translating_port_proxy.hh b/src/mem/translating_port_proxy.hh index bedb57a3ce..7e619784b1 100644 --- a/src/mem/translating_port_proxy.hh +++ b/src/mem/translating_port_proxy.hh @@ -77,16 +77,16 @@ class TranslatingPortProxy : public PortProxy /** Version of tryReadblob that translates virt->phys and deals * with page boundries. */ - bool tryReadBlob(Addr addr, void *p, int size) const override; + bool tryReadBlob(Addr addr, void *p, Addr size) const override; /** Version of tryWriteBlob that translates virt->phys and deals * with page boundries. */ - bool tryWriteBlob(Addr addr, const void *p, int size) const override; + bool tryWriteBlob(Addr addr, const void *p, Addr size) const override; /** * Fill size bytes starting at addr with byte value val. 
*/ - bool tryMemsetBlob(Addr address, uint8_t v, int size) const override; + bool tryMemsetBlob(Addr address, uint8_t v, Addr size) const override; }; } // namespace gem5 From eb22da3749dbb7f17e1464c912cb6314e6cb414b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 11:31:29 -0800 Subject: [PATCH 238/287] Fixing postConsumeProcess. --- src/accl/graph/sega/coalesce_engine.cc | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 263e08d901..4fa400a63a 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -115,7 +115,9 @@ void CoalesceEngine::postConsumeProcess() { WorkListItem items[numElementsPerLine]; - for (Addr addr = 0; addr <= lastAtomAddr; addr += peerMemoryAtomSize) { + Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); + for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { + Addr addr = peerMemoryRange.addIntlvBits(local_addr); int block_index = getBlockIndex(addr); if (cacheBlocks[block_index].addr == addr) { assert(cacheBlocks[block_index].valid); @@ -125,11 +127,6 @@ CoalesceEngine::postConsumeProcess() bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { assert(!cacheBlocks[block_index].items[index].activeNow); - // if (cacheBlocks[block_index].items[index].activeFuture) { - // graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); - // cacheBlocks[block_index].items[index].activeNow = true; - // cacheBlocks[block_index].items[index].activeFuture = false; - // } atom_active_future_before |= cacheBlocks[block_index].items[index].activeFuture; graphWorkload->interIterationInit(cacheBlocks[block_index].items[index]); atom_active_future_after |= cacheBlocks[block_index].items[index].activeFuture; From 1acdbb465257bf3f57ab9b4ff2de31fc4bd8fde0 Mon Sep 17 
00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 18:18:28 -0800 Subject: [PATCH 239/287] Addding BC. --- src/accl/graph/base/graph_workload.cc | 157 +++++++++++++-------- src/accl/graph/base/graph_workload.hh | 52 ++++--- src/accl/graph/sega/centeral_controller.cc | 10 ++ 3 files changed, 140 insertions(+), 79 deletions(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 38f11778b6..6ac2018629 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -118,63 +118,6 @@ BFSWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } -// void -// BFSVisitedWorkload::init(PacketPtr pkt, WorkDirectory* dir) -// { -// size_t pkt_size = pkt->getSize(); -// uint64_t aligned_addr = roundDown(initAddr, pkt_size); - -// if (pkt->getAddr() == aligned_addr) { -// int num_elements = (int) (pkt_size / sizeof(WorkListItem)); -// WorkListItem items[num_elements]; - -// pkt->writeDataToBlock((uint8_t*) items, pkt_size); - -// int index = (int) ((initAddr - aligned_addr) / sizeof(WorkListItem)); -// items[index].tempProp = initValue; -// if (activeCondition(items[index])) { -// dir->activate(aligned_addr); -// } -// pkt->deleteData(); -// pkt->allocate(); -// pkt->setDataFromBlock((uint8_t*) items, pkt_size); -// } -// } - -// uint32_t -// BFSVisitedWorkload::reduce(uint32_t update, uint32_t value) -// { -// return std::min(update, value); -// } - -// uint32_t -// BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) -// { -// return 1; -// } - -// bool -// BFSVisitedWorkload::activeCondition(WorkListItem wl) -// { -// return (wl.tempProp < wl.prop) && (wl.degree > 0); -// } - -// uint32_t -// BFSVisitedWorkload::apply(WorkListItem& wl) -// { -// wl.prop = wl.tempProp; -// return wl.prop; -// } - -// std::string -// BFSVisitedWorkload::printWorkListItem(const WorkListItem wl) -// { -// return csprintf( -// "WorkListItem{tempProp: %u, prop: %u, degree: 
%u, edgeIndex: %u}", -// wl.tempProp, wl.prop, wl.degree, wl.edgeIndex -// ); -// } - uint32_t BFSVisitedWorkload::propagate(uint32_t value, uint32_t weight) { return value; @@ -287,4 +230,104 @@ BSPPRWorkload::printWorkListItem(const WorkListItem wl) wl.activeFuture ? "true" : "false"); } +void +BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int pkt_size = pkt->getSize(); + int aligned_addr = roundDown(initAddr, pkt_size); + + if (aligned_addr == pkt->getAddr()) { + int num_elements = pkt_size / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt_size); + int index = (initAddr - aligned_addr) / sizeof(WorkListItem); + WorkListItem new_wl = items[index]; + uint32_t prop = 0; + prop |= initValue; + // NOTE: Depth of the initial vertex is 0. + prop &= (4294967295U >> 8); + new_wl.tempProp = prop; + new_wl.prop = prop; + if (activeCondition(new_wl, items[index])) { + new_wl.activeNow = true; + dir->activate(aligned_addr); + } + items[index] = new_wl; + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt_size); + } +} + +uint32_t +BSPBCWorkload::reduce(uint32_t update, uint32_t value) +{ + uint32_t update_depth = (update & depthMask) >> 24; + uint32_t update_count = (update & countMask); + assert(update_depth == (currentDepth - 1)); + uint32_t value_depth = (value & depthMask) >> 24; + uint32_t value_count = (value & countMask); + if (value_depth == 255) { + value_depth = update_depth; + value_count = 0; + } + if (value_depth == currentDepth) { + value_count += update_count; + } + uint32_t ret = 0; + ret |= value_count; + warn_if(value_count > 16777215, "value count has grown bigger than 16777125." + " This means the algorithm result might not be correct." + " However, the traversal will not be affected." + " Therefore, performane metrics could be used."); + // HACK: Make sure to always set the depth correctly even if count + // exceeds the 2^24-1 limit. 
Here we reset the depth section of ret. + ret &= (4294967295U >> 8); + // NOTE: Now that the depth is securely reset we can copy the correct value. + ret |= (value_depth << 24); + return ret; +} + +uint32_t +BSPBCWorkload::propagate(uint32_t value, uint32_t weight) +{ + return value; +} + +uint32_t +BSPBCWorkload::apply(WorkListItem& wl) +{ + return wl.prop; +} + +void +BSPBCWorkload::interIterationInit(WorkListItem& wl) +{ + wl.prop = wl.tempProp; +} + +bool +BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + uint32_t depth = (new_wl.tempProp & depthMask) >> 24; + return (depth == currentDepth); +} + +std::string +BSPBCWorkload::printWorkListItem(WorkListItem wl) +{ + uint32_t temp_depth = (wl.tempProp & depthMask) >> 24; + uint32_t temp_count = (wl.tempProp & countMask); + uint32_t depth = (wl.prop & depthMask) >> 24; + uint32_t count = (wl.prop & countMask); + return csprintf( + "WorkListItem{tempProp: (depth: %d, count: %d), " + "prop: (depth: %d, count: %d), degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_depth, temp_count, depth, count, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + } // namespace gem5 diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index fa722a634e..4ed3dcf3ac 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual void iterate() = 0; virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; virtual std::string printWorkListItem(const WorkListItem wl) = 0; @@ -73,6 +74,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); @@ -117,33 +119,39 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); }; -// class BSPBCWorkload : public GraphWorkload -// { -// private: -// int currentDepth; -// Addr initAddr; -// uint32_t initValue; - -// public: -// BSPBCWorkload(Addr init_addr, uint32_t init_value): -// currentDepth(1), initAddr(init_addr), initValue(init_value) -// {} - -// ~BSPBCWorkload() {} - -// virtual void init(PacketPtr pkt, WorkDirectory* dir); -// virtual uint32_t reduce(uint32_t update, uint32_t value); -// 
virtual uint32_t propagate(uint32_t value, uint32_t weight); -// virtual uint32_t apply(WorkListItem& wl); -// virtual void interIterationInit(WorkListItem& wl); -// virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); -// virtual std::string printWorkListItem(const WorkListItem wl); -// }; +class BSPBCWorkload : public GraphWorkload +{ + private: + Addr initAddr; + uint32_t initValue; + + int currentDepth; + + uint32_t depthMask; + uint32_t countMask; + public: + BSPBCWorkload(Addr init_addr, uint32_t init_value): + currentDepth(0), initAddr(init_addr), initValue(init_value), + depthMask(4278190080), countMask(16777215) + {} + + ~BSPBCWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() { currentDepth++; } + virtual void interIterationInit(WorkListItem& wl); + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; } diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 970a0572c5..15062f1465 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -131,6 +131,11 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to set global variables. + // At this point, we know that vertex memory has been + // initialized and we can initialize global variables. 
+ workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { @@ -170,6 +175,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); + // IDEA: Should this be here or after calling start? + // Point of iterate here is to update global variables. + // At this point, we know that vertex memory has been + // updated and we can update global variables. + workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } From c6af36c8432cd6057cc4b3bbc0a88c007ef557f5 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 20:58:16 -0800 Subject: [PATCH 240/287] Adding BC and degbugging. --- configs/accl/{sega-ddr/pr.py => bc.py} | 18 +- configs/accl/bfs.py | 20 +- configs/accl/{sega-ddr => }/cc.py | 0 configs/accl/sega-ddr/bfs.py | 138 -------------- configs/accl/sega-ddr/sega.py | 209 --------------------- configs/accl/sega.py | 98 +++++++--- configs/accl/sega_simple.py | 96 +++++++--- configs/accl/{sega-ddr => }/sssp.py | 0 src/accl/graph/base/graph_workload.cc | 9 +- src/accl/graph/base/graph_workload.hh | 4 +- src/accl/graph/sega/CenteralController.py | 1 + src/accl/graph/sega/centeral_controller.cc | 18 +- src/accl/graph/sega/centeral_controller.hh | 1 + 13 files changed, 195 insertions(+), 417 deletions(-) rename configs/accl/{sega-ddr/pr.py => bc.py} (90%) rename configs/accl/{sega-ddr => }/cc.py (100%) delete mode 100644 configs/accl/sega-ddr/bfs.py delete mode 100644 configs/accl/sega-ddr/sega.py rename configs/accl/{sega-ddr => }/sssp.py (100%) diff --git a/configs/accl/sega-ddr/pr.py b/configs/accl/bc.py similarity index 90% rename from configs/accl/sega-ddr/pr.py rename to configs/accl/bc.py index ea8a103640..074bee73b9 100644 --- a/configs/accl/sega-ddr/pr.py +++ b/configs/accl/bc.py @@ -34,10 +34,12 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", 
type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) - argparser.add_argument("alpha", type=float) + argparser.add_argument("iterations", type=int) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) argparser.add_argument( "--simple", dest="simple", @@ -67,10 +69,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.iterations, - args.alpha, + args.init_addr, + args.init_value, args.simple, args.sample, args.verify, @@ -80,10 +84,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, iterations, - alpha, + init_addr, + init_value, simple, sample, verify, @@ -93,14 +99,14 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_bc_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index ab5de485b1..97f1b5dc21 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -34,10 +34,19 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--visited", + dest="visited", + action="store_const", + const=True, + default=False, + help="Use visitation version of BFS", + ) argparser.add_argument( 
"--simple", dest="simple", @@ -67,10 +76,12 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.init_addr, args.init_value, + args.visited, args.simple, args.sample, args.verify, @@ -80,10 +91,12 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, init_addr, init_value, + visited, simple, sample, verify, @@ -93,14 +106,17 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() system.set_async_mode() system.create_pop_count_directory(64) - system.create_bfs_workload(init_addr, init_value) + if visited: + system.create_bfs_visited_workload(init_addr, init_value) + else: + system.create_bfs_workload(init_addr, init_value) if sample: while True: exit_event = m5.simulate(100000000) diff --git a/configs/accl/sega-ddr/cc.py b/configs/accl/cc.py similarity index 100% rename from configs/accl/sega-ddr/cc.py rename to configs/accl/cc.py diff --git a/configs/accl/sega-ddr/bfs.py b/configs/accl/sega-ddr/bfs.py deleted file mode 100644 index 97f1b5dc21..0000000000 --- a/configs/accl/sega-ddr/bfs.py +++ /dev/null @@ -1,138 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. 
-# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- - -import m5 -import argparse - -from m5.objects import * - - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("num_registers", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - argparser.add_argument( - "--visited", - dest="visited", - action="store_const", - const=True, - default=False, - help="Use visitation version of BFS", - ) - argparser.add_argument( - "--simple", - dest="simple", - action="store_const", - const=True, - default=False, - help="Use simple memory for vertex", - ) - argparser.add_argument( - "--sample", - dest="sample", - action="store_const", - const=True, - default=False, - help="Sample sim stats every 100us", - ) - argparser.add_argument( - "--verify", - dest="verify", - action="store_const", - const=True, - default=False, - help="Print final answer", - ) - - args = argparser.parse_args() - - return ( - args.num_gpts, - args.num_registers, - args.cache_size, - args.graph, - args.init_addr, - args.init_value, - args.visited, - args.simple, - args.sample, - args.verify, - ) - - -if __name__ == "__m5_main__": - ( - num_gpts, - num_registers, - cache_size, - graph, - init_addr, - init_value, - visited, - simple, - sample, - verify, - ) = get_inputs() - - if simple: - from sega_simple import SEGA - else: - from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) - root = Root(full_system=False, system=system) - - m5.instantiate() - - system.set_async_mode() - system.create_pop_count_directory(64) - if visited: - system.create_bfs_visited_workload(init_addr, init_value) - else: - system.create_bfs_workload(init_addr, init_value) - if sample: - while True: - exit_event = m5.simulate(100000000) - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - 
m5.stats.dump() - m5.stats.reset() - if exit_event.getCause() != "simulate() limit reached": - break - else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) - if verify: - system.print_answer() diff --git a/configs/accl/sega-ddr/sega.py b/configs/accl/sega-ddr/sega.py deleted file mode 100644 index 8325cf7565..0000000000 --- a/configs/accl/sega-ddr/sega.py +++ /dev/null @@ -1,209 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - -from math import log -from m5.objects import * - - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append( - AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i, - ) - ) - return ret, intlv_low_bit + intlv_bits - 1 - - -class GPT(SubSystem): - def __init__(self, register_file_size: int, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - max_resp_per_cycle=8, - pending_pull_limit=64, - active_buffer_size=80, - post_push_wb_queue_size=64, - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=4096, - max_propagates_per_cycle=8, - update_queue_size=32, - ) - - self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - dram_2=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - ) - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - 
push_engine=self.push_engine, - ) - - def getRespPort(self): - return self.wl_engine.in_ports - - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - - def setReqPort(self, port): - self.push_engine.out_ports = port - - def getEdgeMemPort(self): - return self.push_engine.mem_port - - def setEdgeMemPort(self, port): - self.push_engine.mem_port = port - - def set_vertex_range(self, vertex_ranges): - self.vertex_mem_ctrl.dram.range = vertex_ranges[0] - self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] - - def set_vertex_pch_bit(self, pch_bit): - self.vertex_mem_ctrl.pch_bit = pch_bit - - -class EdgeMemory(SubSystem): - def __init__(self, size: str): - super(EdgeMemory, self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2.4GHz" - self.clk_domain.voltage_domain = VoltageDomain() - - self.mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) - ) - self.xbar = NoncoherentXBar( - width=64, frontend_latency=1, forward_latency=1, response_latency=1 - ) - self.xbar.mem_side_ports = self.mem_ctrl.port - - def set_image(self, image): - self.mem_ctrl.dram.image_file = image - - def getPort(self): - return self.xbar.cpu_side_ports - - def setPort(self, port): - self.xbar.cpu_side_ports = port - -class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): - super(SEGA, self).__init__() - # num_gpts should be an even power of 2 - assert num_gpts != 0 - assert num_gpts % 2 == 0 - assert (num_gpts & (num_gpts - 1)) == 0 - - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = "2GHz" - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - # Building the CenteralController - self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") - # Building the EdgeMemories - edge_mem = [] - for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") - 
mem.set_image(f"{graph_path}/edgelist_{i}") - edge_mem.append(mem) - self.edge_mem = edge_mem - # Building the GPTs - vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 - ) - gpts = [] - for i in range(num_gpts): - gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_gpts]] - ) - gpt.set_vertex_pch_bit(pch_bit) - gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def work_count(self): - return self.ctrl.workCount() - - def set_async_mode(self): - self.ctrl.setAsyncMode() - - def set_bsp_mode(self): - self.ctrl.setBSPMode() - - def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) - - def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) - - def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) - - def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) - - def create_cc_workload(self): - self.ctrl.createCCWorkload() - - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) - - def print_answer(self): - self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index b5ce618f7f..32124731d6 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, 
register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -68,16 +69,14 @@ def __init__( ) self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96), - dram_2=HBM_2000_4H_1x64(page_policy="close", read_buffer_size=96, write_buffer_size=96) - ) - - self.edge_mem_ctrl = MemCtrl( - dram= - DDR4_2400_8x8(range=AddrRange(edge_memory_size), in_addr_map=False) + dram=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), + dram_2=HBM_2000_4H_1x64( + page_policy="close", read_buffer_size=96, write_buffer_size=96 + ), ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -97,6 +96,12 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_ranges): self.vertex_mem_ctrl.dram.range = vertex_ranges[0] self.vertex_mem_ctrl.dram_2.range = vertex_ranges[1] @@ -104,32 +109,65 @@ def set_vertex_range(self, vertex_ranges): def set_vertex_pch_bit(self, pch_bit): self.vertex_mem_ctrl.pch_bit = pch_bit - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + 
dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts/2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges, pch_bit = interleave_addresses( - AddrRange(start=0, size="4GiB"), 2 * num_mpus, 32 + AddrRange(start=0, size="4GiB"), 2 * num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_mpus]] + [vertex_ranges[i], vertex_ranges[i + num_gpts]] ) gpt.set_vertex_pch_bit(pch_bit) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -154,8 +192,20 @@ def 
create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff97134b47..ff567b57e3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -47,16 +47,17 @@ def interleave_addresses(plain_range, num_channels, cache_line_size): class GPT(SubSystem): - def __init__( - self, edge_memory_size: str, cache_size: str): + def __init__(self, register_file_size: int, cache_size: str): super().__init__() - self.wl_engine = WLEngine(update_queue_size=64, register_file_size=64) + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, cache_size=cache_size, max_resp_per_cycle=8, - pending_pull_limit=32, - active_buffer_size=64, + pending_pull_limit=64, + active_buffer_size=80, post_push_wb_queue_size=64, ) self.push_engine = PushEngine( @@ -67,14 +68,10 @@ def __init__( update_queue_size=32, ) - self.vertex_mem_ctrl = SimpleMemory(latency="122ns", latency_var="0ns", bandwidth="28GiB/s") - - self.edge_mem_ctrl = MemCtrl( - dram=DDR4_2400_8x8( - range=AddrRange(edge_memory_size), in_addr_map=False) + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port 
- self.push_engine.mem_port = self.edge_mem_ctrl.port self.mpu = MPU( wl_engine=self.wl_engine, @@ -94,32 +91,77 @@ def getReqPort(self): def setReqPort(self, port): self.push_engine.out_ports = port + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + def set_vertex_range(self, vertex_range): self.vertex_mem_ctrl.range = vertex_range - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.dram.image_file = edge_image + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + self.clk_domain = SrcClockDomain() self.clk_domain.clock = "2GHz" self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - + # Building the CenteralController + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + 
mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), num_mpus, 32 + AddrRange(start=0, size="4GiB"), num_gpts, 32 ) - gpts = [] - for i in range(num_mpus): - gpt = GPT("2GiB", cache_size) + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) gpt.set_vertex_range(vertex_ranges[i]) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: @@ -144,8 +186,20 @@ def create_pop_count_directory(self, atoms_per_block): def create_bfs_workload(self, init_addr, init_value): self.ctrl.createBFSWorkload(init_addr, init_value) + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + def print_answer(self): self.ctrl.printAnswerToHostSimout() diff --git a/configs/accl/sega-ddr/sssp.py b/configs/accl/sssp.py similarity index 100% rename from configs/accl/sega-ddr/sssp.py rename to configs/accl/sssp.py diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 6ac2018629..7bcd447b8e 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -245,7 +245,7 @@ BSPBCWorkload::init(PacketPtr pkt, WorkDirectory* dir) uint32_t prop = 0; prop |= initValue; // NOTE: Depth of the initial vertex is 0. 
- prop &= (4294967295U >> 8); + prop &= countMask; new_wl.tempProp = prop; new_wl.prop = prop; if (activeCondition(new_wl, items[index])) { @@ -265,11 +265,10 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) { uint32_t update_depth = (update & depthMask) >> 24; uint32_t update_count = (update & countMask); - assert(update_depth == (currentDepth - 1)); uint32_t value_depth = (value & depthMask) >> 24; uint32_t value_count = (value & countMask); if (value_depth == 255) { - value_depth = update_depth; + value_depth = currentDepth; value_count = 0; } if (value_depth == currentDepth) { @@ -283,7 +282,7 @@ BSPBCWorkload::reduce(uint32_t update, uint32_t value) " Therefore, performane metrics could be used."); // HACK: Make sure to always set the depth correctly even if count // exceeds the 2^24-1 limit. Here we reset the depth section of ret. - ret &= (4294967295U >> 8); + ret &= countMask; // NOTE: Now that the depth is securely reset we can copy the correct value. ret |= (value_depth << 24); return ret; @@ -311,7 +310,7 @@ bool BSPBCWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) { uint32_t depth = (new_wl.tempProp & depthMask) >> 24; - return (depth == currentDepth); + return (depth == currentDepth) && (new_wl.degree > 0); } std::string diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 4ed3dcf3ac..5a55ad4cdc 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -137,8 +137,8 @@ class BSPBCWorkload : public GraphWorkload uint32_t countMask; public: BSPBCWorkload(Addr init_addr, uint32_t init_value): - currentDepth(0), initAddr(init_addr), initValue(init_value), - depthMask(4278190080), countMask(16777215) + initAddr(init_addr), initValue(init_value), + currentDepth(0), depthMask(4278190080), countMask(16777215) {} ~BSPBCWorkload() {} diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 
f3210a8ec3..7e16b7e7de 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -50,6 +50,7 @@ class CenteralController(ClockedObject): PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), PyBindMethod("createPRWorkload"), + PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 15062f1465..86b9ea2b02 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -81,6 +81,12 @@ CenteralController::createPRWorkload(float alpha) workload = new BSPPRWorkload(alpha); } +void +CenteralController::createBCWorkload(Addr init_addr, uint32_t init_value) +{ + workload = new BSPBCWorkload(init_addr, init_value); +} + void CenteralController::createPopCountDirectory(int atoms_per_block) { @@ -131,17 +137,13 @@ CenteralController::startup() panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to set global variables. - // At this point, we know that vertex memory has been - // initialized and we can initialize global variables. - workload->iterate(); for (auto mpu: mpuVector) { mpu->postMemInitSetup(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); } PacketPtr @@ -175,15 +177,11 @@ CenteralController::recvDoneSignal() for (auto mpu: mpuVector) { mpu->postConsumeProcess(); mpu->swapDirectories(); - // IDEA: Should this be here or after calling start? - // Point of iterate here is to update global variables. - // At this point, we know that vertex memory has been - // updated and we can update global variables. 
- workload->iterate(); if (!mpu->running() && (mpu->workCount() > 0)) { mpu->start(); } } + workload->iterate(); exitSimLoopNow("finished an iteration."); } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index aa3938353d..ba829061b5 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,6 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createPRWorkload(float alpha); + void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From 787f7f4f45ffeb9e312f4a9000f58742552b555d Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 14 Nov 2022 21:03:16 -0800 Subject: [PATCH 241/287] Fixing BC run script. --- configs/accl/bc.py | 11 ++++------- 1 file changed, 4 insertions(+), 7 deletions(-) diff --git a/configs/accl/bc.py b/configs/accl/bc.py index 074bee73b9..56faeb3e4d 100644 --- a/configs/accl/bc.py +++ b/configs/accl/bc.py @@ -37,7 +37,6 @@ def get_inputs(): argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("graph", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) argparser.add_argument( @@ -72,7 +71,6 @@ def get_inputs(): args.num_registers, args.cache_size, args.graph, - args.iterations, args.init_addr, args.init_value, args.simple, @@ -87,7 +85,6 @@ def get_inputs(): num_registers, cache_size, graph, - iterations, init_addr, init_value, simple, @@ -119,16 +116,16 @@ def get_inputs(): if exit_event.getCause() != "simulate() limit reached": break else: - iteration = 0 - while iteration < iterations: + iterations = 0 + while True: exit_event = m5.simulate() print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" ) - iteration += 1 
+ iterations += 1 if system.work_count() == 0: break - print(f"#iterations: {iteration}") + print(f"#iterations: {iterations}") if verify: system.print_answer() From b13d005fcb65f7d9e6d97ecc6285044055efa7d7 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 16 Nov 2022 22:54:39 -0800 Subject: [PATCH 242/287] Fixing dirty issue in bsp. --- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/coalesce_engine.cc | 5 +++-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 32124731d6..672151ceed 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -152,7 +152,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts/2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ff567b57e3..06908d08d3 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -147,7 +147,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 4fa400a63a..a2d4378377 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -114,7 +114,6 @@ CoalesceEngine::postMemInitSetup() void CoalesceEngine::postConsumeProcess() { - WorkListItem items[numElementsPerLine]; Addr last_local_atom_addr = peerMemoryRange.removeIntlvBits(lastAtomAddr); for (Addr local_addr = 0; local_addr <= last_local_atom_addr; local_addr += peerMemoryAtomSize) { Addr addr = 
peerMemoryRange.addIntlvBits(local_addr); @@ -133,6 +132,7 @@ CoalesceEngine::postConsumeProcess() if (cacheBlocks[block_index].items[index].activeFuture) { cacheBlocks[block_index].items[index].activeFuture = false; cacheBlocks[block_index].items[index].activeNow = true; + cacheBlocks[block_index].dirty = true; } } if (!atom_active_future_before && atom_active_future_after) { @@ -142,10 +142,10 @@ CoalesceEngine::postConsumeProcess() futureActiveCacheBlocks.erase(block_index); } } else { + WorkListItem items[numElementsPerLine]; PacketPtr read_pkt = createReadPacket(addr, peerMemoryAtomSize); memPort.sendFunctional(read_pkt); read_pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - delete read_pkt; bool atom_active_future_before = false; bool atom_active_future_after = false; for (int index = 0; index < numElementsPerLine; index++) { @@ -166,6 +166,7 @@ CoalesceEngine::postConsumeProcess() } PacketPtr write_pkt = createWritePacket(addr, peerMemoryAtomSize, (uint8_t*) items); memPort.sendFunctional(write_pkt); + delete read_pkt; delete write_pkt; } } From 7861b6a29700aaaf606a6f4b5a47611aea086c87 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 19:26:29 -0800 Subject: [PATCH 243/287] Adding Async PR. 
--- configs/accl/async-pr.py | 125 +++++++++++++++++++++ configs/accl/pr.py | 6 +- configs/accl/sega.py | 6 + configs/accl/sega_simple.py | 3 + src/accl/graph/base/graph_workload.cc | 78 +++++++++++++ src/accl/graph/base/graph_workload.hh | 30 ++++- src/accl/graph/sega/CenteralController.py | 2 + src/accl/graph/sega/centeral_controller.cc | 13 +++ src/accl/graph/sega/centeral_controller.hh | 2 + src/accl/graph/sega/coalesce_engine.cc | 23 +++- src/accl/graph/sega/coalesce_engine.hh | 3 + src/accl/graph/sega/wl_engine.cc | 9 ++ src/accl/graph/sega/wl_engine.hh | 3 + 13 files changed, 294 insertions(+), 9 deletions(-) create mode 100644 configs/accl/async-pr.py diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py new file mode 100644 index 0000000000..0bfb6caeaa --- /dev/null +++ b/configs/accl/async-pr.py @@ -0,0 +1,125 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + + +import m5 +import argparse + +from m5.objects import * + + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("alpha", type=float) + argparser.add_argument("threshold", type=float) + argparser.add_argument( + "--simple", + dest="simple", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) + argparser.add_argument( + "--sample", + dest="sample", + action="store_const", + const=True, + default=False, + help="Sample sim stats every 100us", + ) + argparser.add_argument( + "--verify", + dest="verify", + action="store_const", + const=True, + default=False, + help="Print final answer", + ) + + args = argparser.parse_args() + + return ( + args.num_gpts, + args.num_registers, + args.cache_size, + args.graph, + args.alpha, + args.threshold, + args.simple, + args.sample, + args.verify, + ) + + +if __name__ == "__m5_main__": + ( + num_gpts, + num_registers, + cache_size, + graph, + alpha, + threshold, + simple, + sample, + verify, + ) = get_inputs() + + if simple: + from sega_simple import SEGA + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) + root = Root(full_system=False, system=system) + + m5.instantiate() + + system.set_async_mode() + 
system.create_pop_count_directory(64) + system.create_async_pr_workload(alpha, threshold) + if sample: + while True: + exit_event = m5.simulate(100000000) + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + m5.stats.dump() + m5.stats.reset() + if exit_event.getCause() != "simulate() limit reached": + break + else: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if verify: + system.print_answer() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index ea8a103640..42ae46ea78 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -34,6 +34,7 @@ def get_inputs(): argparser = argparse.ArgumentParser() argparser.add_argument("num_gpts", type=int) + argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) @@ -67,6 +68,7 @@ def get_inputs(): return ( args.num_gpts, + args.num_registers, args.cache_size, args.graph, args.iterations, @@ -80,6 +82,7 @@ def get_inputs(): if __name__ == "__m5_main__": ( num_gpts, + num_registers, cache_size, graph, iterations, @@ -93,7 +96,7 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() @@ -121,6 +124,7 @@ def get_inputs(): + f"because {exit_event.getCause()}" ) iteration += 1 + print(f"error: {system.get_pr_error()}") if system.work_count() == 0: break print(f"#iterations: {iteration}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 672151ceed..ef23575b9b 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -201,9 +201,15 @@ def create_sssp_workload(self, init_addr, init_value): def create_cc_workload(self): self.ctrl.createCCWorkload() + def 
create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) + def get_pr_error(self): + return self.ctrl.getPRError() + def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 06908d08d3..d6ae8772a5 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -195,6 +195,9 @@ def create_sssp_workload(self, init_addr, init_value): def create_cc_workload(self): self.ctrl.createCCWorkload() + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + def create_pr_workload(self, alpha): self.ctrl.createPRWorkload(alpha) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 7bcd447b8e..3a401f0963 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -154,6 +154,81 @@ SSSPWorkload::propagate(uint32_t value, uint32_t weight) return value + weight; } +void +PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) +{ + int num_elements = pkt->getSize() / sizeof(WorkListItem); + WorkListItem items[num_elements]; + pkt->writeDataToBlock((uint8_t*) items, pkt->getSize()); + + bool atom_active = false; + for (int index = 0; index < num_elements; index++) { + WorkListItem new_wl = items[index]; + new_wl.tempProp = readFromFloat(0); + new_wl.prop = readFromFloat(1 - alpha); + atom_active |= activeCondition(new_wl, items[index]); + items[index] = new_wl; + } + if (atom_active) { + dir->activate(pkt->getAddr()); + } + + pkt->deleteData(); + pkt->allocate(); + pkt->setDataFromBlock((uint8_t*) items, pkt->getSize()); +} + +uint32_t +PRWorkload::reduce(uint32_t update, uint32_t value) +{ + float update_float = writeToFloat(update); + float value_float = writeToFloat(value); + return 
readFromFloat(update_float + value_float); +} + +uint32_t +PRWorkload::propagate(uint32_t value, uint32_t weight) +{ + float value_float = writeToFloat(value); + float weight_float = writeToFloat(weight); + if (weight == 0) { + weight_float = 1.0; + } + return readFromFloat(alpha * value_float * weight_float); +} + +bool +PRWorkload::activeCondition(WorkListItem new_wl, WorkListItem old_wl) +{ + float temp_float = writeToFloat(new_wl.tempProp); + float prop_float = writeToFloat(new_wl.prop); + float dist = std::abs(temp_float - prop_float); + return (dist >= threshold) && (new_wl.degree > 0); +} + +uint32_t +PRWorkload::apply(WorkListItem& wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + float delta = (temp_float - prop_float) / wl.degree; + wl.prop = wl.tempProp; + return readFromFloat(delta); +} + +std::string +PRWorkload::printWorkListItem(const WorkListItem wl) +{ + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + return csprintf( + "WorkListItem{tempProp: %f, prop: %f, degree: %u, " + "edgeIndex: %u, activeNow: %s, activeFuture: %s}", + temp_float, prop_float, wl.degree, wl.edgeIndex, + wl.activeNow ? "true" : "false", + wl.activeFuture ? 
"true" : "false"); +} + void BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) { @@ -212,6 +287,9 @@ BSPPRWorkload::apply(WorkListItem& wl) void BSPPRWorkload::interIterationInit(WorkListItem& wl) { + float temp_float = writeToFloat(wl.tempProp); + float prop_float = writeToFloat(wl.prop); + error += std::abs(temp_float - prop_float); wl.prop = wl.tempProp; wl.tempProp = readFromFloat(1 - alpha); wl.activeFuture = (wl.degree > 0); diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 5a55ad4cdc..d42bfd0f26 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -105,13 +105,37 @@ class SSSPWorkload : public BFSWorkload virtual uint32_t propagate(uint32_t value, uint32_t weight) override; }; +class PRWorkload : public GraphWorkload +{ + private: + float alpha; + float threshold; + + public: + PRWorkload(float alpha, float threshold): + alpha(alpha), threshold(threshold) + {} + + ~PRWorkload() {} + + virtual void init(PacketPtr pkt, WorkDirectory* dir); + virtual uint32_t reduce(uint32_t update, uint32_t value); + virtual uint32_t propagate(uint32_t value, uint32_t weight); + virtual uint32_t apply(WorkListItem& wl); + virtual void iterate() {} + virtual void interIterationInit(WorkListItem& wl) {}; + virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); + virtual std::string printWorkListItem(const WorkListItem wl); +}; + class BSPPRWorkload : public GraphWorkload { private: float alpha; + float error; public: - BSPPRWorkload(float alpha): alpha(alpha) {} + BSPPRWorkload(float alpha): alpha(alpha), error(0) {} ~BSPPRWorkload() {} @@ -119,10 +143,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() {} + virtual void iterate() { error = 0; } virtual void 
interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); + + float getError() { return error; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 7e16b7e7de..c5f44c82e9 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -49,8 +49,10 @@ class CenteralController(ClockedObject): PyBindMethod("createBFSVisitedWorkload"), PyBindMethod("createSSSPWorkload"), PyBindMethod("createCCWorkload"), + PyBindMethod("createAsyncPRWorkload"), PyBindMethod("createPRWorkload"), PyBindMethod("createBCWorkload"), PyBindMethod("workCount"), + PyBindMethod("getPRError"), PyBindMethod("printAnswerToHostSimout") ] diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 86b9ea2b02..23eb6bbc0e 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -75,6 +75,12 @@ CenteralController::createCCWorkload() workload = new CCWorkload(); } +void +CenteralController::createAsyncPRWorkload(float alpha, float threshold) +{ + workload = new PRWorkload(alpha, threshold); +} + void CenteralController::createPRWorkload(float alpha) { @@ -196,6 +202,13 @@ CenteralController::workCount() return work_count; } +float +CenteralController::getPRError() +{ + BSPPRWorkload* pr_workload = dynamic_cast(workload); + return pr_workload->getError(); +} + void CenteralController::printAnswerToHostSimout() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index ba829061b5..e73ed22666 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -73,12 +73,14 @@ class CenteralController : public ClockedObject void createBFSVisitedWorkload(Addr init_addr, 
uint32_t init_value); void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); + void createAsyncPRWorkload(float alpha, float threshold); void createPRWorkload(float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); int workCount(); + float getPRError(); void printAnswerToHostSimout(); }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a2d4378377..02c98ba640 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -59,6 +59,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): nextApplyEvent([this] { processNextApplyEvent(); }, name() + ".nextApplyEvent"), + nextDoneSignalEvent([this] { + processNextDoneSignalEvent(); + }, name() + ".nextDoneSignalEvent"), stats(*this) { assert(isPowerOf2(numLines) && isPowerOf2(numElementsPerLine)); @@ -552,8 +555,8 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } return true; } @@ -712,8 +715,9 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) block_index, cacheBlocks[block_index].to_string()); stats.numVertexWrites++; - if ((cacheBlocks[block_index].state == CacheState::IDLE) && done()) { - owner->recvDoneSignal(); + if ((cacheBlocks[block_index].state == CacheState::IDLE) && + done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -749,8 +753,8 @@ CoalesceEngine::processNextMemoryEvent() schedule(nextMemoryEvent, nextCycle()); } - if (done()) { - owner->recvDoneSignal(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); } } @@ -1170,6 +1174,13 @@ CoalesceEngine::processNextApplyEvent() } } +void +CoalesceEngine::processNextDoneSignalEvent() +{ + if (done()) { + owner->recvDoneSignal(); + } +} 
CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) : statistics::Group(&_coalesce), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 8ee17781fc..b6eec725f9 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -151,6 +151,9 @@ class CoalesceEngine : public BaseMemoryEngine EventFunctionWrapper nextApplyEvent; void processNextApplyEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct CoalesceStats : public statistics::Group { CoalesceStats(CoalesceEngine &coalesce); diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index ed91622b43..d563450179 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -43,6 +43,7 @@ WLEngine::WLEngine(const WLEngineParams& params): registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), stats(*this) { for (int i = 0; i < params.port_in_ports_connection_count; ++i) { @@ -316,6 +317,14 @@ WLEngine::processNextReduceEvent() } workListFile.clear(); + if (done() && !nextDoneSignalEvent.scheduled()) { + schedule(nextDoneSignalEvent, nextCycle()); + } +} + +void +WLEngine::processNextDoneSignalEvent() +{ if (done()) { owner->recvDoneSignal(); } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 45baaa1e79..fb147e692a 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -90,6 +90,9 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + EventFunctionWrapper nextDoneSignalEvent; + void processNextDoneSignalEvent(); + struct WorkListStats : public statistics::Group { WorkListStats(WLEngine &worklist); From 
a991328c22c7dfa6b1b1e03d6d18868c651c3c0e Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 17 Nov 2022 20:33:07 -0800 Subject: [PATCH 244/287] Fixing typos. --- configs/accl/pr.py | 14 ++++++++++++-- configs/accl/sega.py | 4 ++-- configs/accl/sega_simple.py | 4 ++-- src/accl/graph/base/graph_workload.cc | 6 +++--- src/accl/graph/base/graph_workload.hh | 10 +++++++--- src/accl/graph/sega/centeral_controller.cc | 4 ++-- src/accl/graph/sega/centeral_controller.hh | 2 +- 7 files changed, 29 insertions(+), 15 deletions(-) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 42ae46ea78..569514eb82 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -36,9 +36,11 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) - argparser.add_argument("iterations", type=int) argparser.add_argument("graph", type=str) + argparser.add_argument("iterations", type=int) argparser.add_argument("alpha", type=float) + argparser.add_argument("--num_nodes", type=int, default=1) + argparser.add_argument("--error_threshold", type=float, default=0.0) argparser.add_argument( "--simple", dest="simple", @@ -73,6 +75,8 @@ def get_inputs(): args.graph, args.iterations, args.alpha, + args.num_nodes, + args.error_threshold, args.simple, args.sample, args.verify, @@ -87,11 +91,15 @@ def get_inputs(): graph, iterations, alpha, + num_nodes, + error_threshold, simple, sample, verify, ) = get_inputs() + print(f"error_threshold: {error_threshold}") + if simple: from sega_simple import SEGA else: @@ -103,7 +111,7 @@ def get_inputs(): system.set_bsp_mode() system.create_pop_count_directory(64) - system.create_pr_workload(alpha) + system.create_pr_workload(num_nodes, alpha) if sample: while True: exit_event = m5.simulate(100000000) @@ -125,6 +133,8 @@ def get_inputs(): ) iteration += 1 print(f"error: {system.get_pr_error()}") + if system.get_pr_error() < error_threshold: + break if 
system.work_count() == 0: break print(f"#iterations: {iteration}") diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ef23575b9b..32d0dd26ab 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -204,8 +204,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def get_pr_error(self): return self.ctrl.getPRError() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index d6ae8772a5..2d36ec584d 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -198,8 +198,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index 3a401f0963..ab58b02b73 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -240,8 +240,8 @@ BSPPRWorkload::init(PacketPtr pkt, WorkDirectory* dir) bool atom_active = false; for (int i = 0; i < num_elements; i++) { WorkListItem new_wl = items[i]; - new_wl.tempProp = readFromFloat(1 - alpha); - new_wl.prop = readFromFloat(1); + new_wl.tempProp = readFromFloat((1 - alpha)/numNodes); + new_wl.prop = readFromFloat(1/numNodes); new_wl.activeNow = activeCondition(new_wl, items[i]); atom_active |= new_wl.activeNow; items[i] = new_wl; @@ -291,7 +291,7 @@ BSPPRWorkload::interIterationInit(WorkListItem& wl) float prop_float = writeToFloat(wl.prop); error += 
std::abs(temp_float - prop_float); wl.prop = wl.tempProp; - wl.tempProp = readFromFloat(1 - alpha); + wl.tempProp = readFromFloat((1 - alpha) / numNodes); wl.activeFuture = (wl.degree > 0); } diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index d42bfd0f26..72748502c1 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ -131,11 +131,15 @@ class PRWorkload : public GraphWorkload class BSPPRWorkload : public GraphWorkload { private: + int numNodes; float alpha; + float prevError; float error; public: - BSPPRWorkload(float alpha): alpha(alpha), error(0) {} + BSPPRWorkload(int num_nodes, float alpha): + numNodes(num_nodes), alpha(alpha), prevError(0), error(0) + {} ~BSPPRWorkload() {} @@ -143,12 +147,12 @@ class BSPPRWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); - virtual void iterate() { error = 0; } + virtual void iterate() { prevError = error; error = 0; } virtual void interIterationInit(WorkListItem& wl); virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); virtual std::string printWorkListItem(const WorkListItem wl); - float getError() { return error; } + float getError() { return prevError; } }; class BSPBCWorkload : public GraphWorkload diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 23eb6bbc0e..0aee3b77ce 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -82,9 +82,9 @@ CenteralController::createAsyncPRWorkload(float alpha, float threshold) } void -CenteralController::createPRWorkload(float alpha) +CenteralController::createPRWorkload(int num_nodes, float alpha) { - workload = new BSPPRWorkload(alpha); + workload = new BSPPRWorkload(num_nodes, alpha); } void diff --git 
a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index e73ed22666..cce9ac2725 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -74,7 +74,7 @@ class CenteralController : public ClockedObject void createSSSPWorkload(Addr init_addr, uint32_t init_value); void createCCWorkload(); void createAsyncPRWorkload(float alpha, float threshold); - void createPRWorkload(float alpha); + void createPRWorkload(int num_nodes, float alpha); void createBCWorkload(Addr init_addr, uint32_t init_value); void recvDoneSignal(); From da4decf6a2960a7489f1d8450069a9314dae21b0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 7 Feb 2023 14:03:15 -0800 Subject: [PATCH 245/287] Fixing init in asyncPR. --- src/accl/graph/base/graph_workload.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/base/graph_workload.cc b/src/accl/graph/base/graph_workload.cc index ab58b02b73..fd802cf275 100644 --- a/src/accl/graph/base/graph_workload.cc +++ b/src/accl/graph/base/graph_workload.cc @@ -166,7 +166,8 @@ PRWorkload::init(PacketPtr pkt, WorkDirectory* dir) WorkListItem new_wl = items[index]; new_wl.tempProp = readFromFloat(0); new_wl.prop = readFromFloat(1 - alpha); - atom_active |= activeCondition(new_wl, items[index]); + new_wl.activeNow = activeCondition(new_wl, items[index]); + atom_active |= new_wl.activeNow; items[index] = new_wl; } if (atom_active) { From 7256874c4596608c6721768b3f06a1bd21f16879 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 9 Mar 2023 11:27:37 -0800 Subject: [PATCH 246/287] Improving UniqueFIFO implementation. 
--- src/accl/graph/base/data_structs.hh | 101 +++++++++++++++++++------ src/accl/graph/sega/coalesce_engine.cc | 6 ++ src/accl/graph/sega/push_engine.cc | 6 +- src/accl/graph/sega/work_directory.hh | 1 + 4 files changed, 87 insertions(+), 27 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index f09a0dd167..a391e0794d 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,7 +34,7 @@ #include #include -#include +#include namespace gem5 { @@ -137,56 +137,107 @@ template class UniqueFIFO { private: - std::list fifo; + int cap; + int pop; + + int* added; + int* deleted; + std::deque container; public: - UniqueFIFO() {} + UniqueFIFO() { + cap = 0; + pop = 0; + added = nullptr; + deleted = nullptr; + } - void push_back(T item) - { - if (!find(item)) { - fifo.push_back(item); + UniqueFIFO(int size) { + cap = size; + pop = 0; + + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; } + container.clear(); } - void pop_front() - { - assert(!fifo.empty()); - fifo.pop_front(); + void fix_front() { + while(true) { + T elem = container.front(); + if (deleted[elem] > 0) { + deleted[elem]--; + added[elem]--; + container.pop_front(); + } else { + assert(deleted[elem] == 0); + assert(added[elem] == 1); + break; + } + } } - T front() - { - return fifo.front(); + T front() { + fix_front(); + return container.front(); } size_t size() { - return fifo.size(); + return pop; } void clear() { - fifo.clear(); + pop = 0; + for (int i = 0; i < cap; i++) { + added[i] = 0; + deleted[i] = 0; + } + container.clear(); } bool empty() { - return fifo.empty(); + return size() == 0; } bool find(T item) { - // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); - auto it = std::find(fifo.begin(), fifo.end(), item); - return (it != fifo.end()); + assert(added[item] >= 0); + assert(deleted[item] >= 0); + int diff = 
added[item] - deleted[item]; + assert((diff == 0) || (diff == 1)); + return (diff == 1); + } + + void push_back(T item) { + if (!find(item)) { + added[item]++; + pop++; + container.push_back(item); + } + } + + void pop_front() { + T elem = front(); + assert(added[elem] == 1); + added[elem] = 0; + pop--; + container.pop_front(); } void erase(T item) { - // std::list::iterator it = std::find(fifo.begin(), fifo.end(), item); - auto it = std::find(fifo.begin(), fifo.end(), item); - assert(it != fifo.end()); - fifo.erase(it); + assert(find(item)); + deleted[item]++; + pop--; } void operator=(const UniqueFIFO& rhs) { - fifo = rhs.fifo; + pop = rhs.pop; + container = rhs.container; + added = rhs.added; + deleted = rhs.deleted; } }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 02c98ba640..8c38341f48 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -69,6 +69,9 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } + currentActiveCacheBlocks = UniqueFIFO(numLines); + futureActiveCacheBlocks = UniqueFIFO(numLines); + activeBuffer.clear(); postPushWBQueue.clear(); } @@ -404,6 +407,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); ReadPurpose* purpose = pkt->findNextSenderState(); + // TODO: delete purpose // NOTE: Regardless of where the pkt will go we have to release the // reserved space for this pkt in the activeBuffer in case @@ -553,6 +557,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) pullsScheduled++; } } + delete purpose; } if (done() && !nextDoneSignalEvent.scheduled()) { @@ -999,6 +1004,7 @@ CoalesceEngine::processNextPostPushWB(int ignore, Tick schedule_tick) void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { + DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); 
pullsScheduled--; if (!currentDirectory->empty()) { Addr addr = currentDirectory->getNextWork(); diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index a8c9a1bcb1..981b581b7c 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -273,7 +273,9 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; + // TODO: Change above line to below line. + uint8_t pkt_data [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); @@ -291,7 +293,7 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); - delete pkt_data; + // delete [] pkt_data; delete pkt; if (!nextPropagateEvent.scheduled()) { diff --git a/src/accl/graph/sega/work_directory.hh b/src/accl/graph/sega/work_directory.hh index 18430aee0d..620e97f654 100644 --- a/src/accl/graph/sega/work_directory.hh +++ b/src/accl/graph/sega/work_directory.hh @@ -100,6 +100,7 @@ class PopCountDirectory: public WorkDirectory for (int index = 0; index < numCounters; index++) { popCount[index] = 0; } + activeBlockIndices = UniqueFIFO(numCounters); } // CAUTION: This should only be called when the work From cde38f31536744352c42c5cf863b46e636752fdf Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 21 Mar 2023 14:59:01 -0700 Subject: [PATCH 247/287] Improving sim performance for push engine. 
--- src/accl/graph/sega/coalesce_engine.cc | 1 - src/accl/graph/sega/push_engine.cc | 157 ++++++++++++------------- src/accl/graph/sega/push_engine.hh | 14 ++- 3 files changed, 88 insertions(+), 84 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 8c38341f48..fcdd26ceb4 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -407,7 +407,6 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) Addr addr = pkt->getAddr(); int block_index = getBlockIndex(addr); ReadPurpose* purpose = pkt->findNextSenderState(); - // TODO: delete purpose // NOTE: Regardless of where the pkt will go we have to release the // reserved space for this pkt in the activeBuffer in case diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 981b581b7c..4703e27d16 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -29,6 +29,7 @@ #include "accl/graph/sega/push_engine.hh" #include "accl/graph/sega/mpu.hh" +#include "base/intmath.hh" #include "debug/PushEngine.hh" #include "mem/packet_access.hh" #include "sim/sim_exit.hh" @@ -50,9 +51,13 @@ PushEngine::PushEngine(const Params& params): nextUpdatePushEvent([this] { processNextUpdatePushEvent(); }, name()), stats(*this) { + destinationQueues.clear(); for (int i = 0; i < params.port_out_ports_connection_count; ++i) { - outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); + outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); + destinationQueues.emplace_back(); + destinationQueues[i].clear(); + sourceAndValueMaps.emplace_back(); + sourceAndValueMaps[i].clear(); } } @@ -73,7 +78,10 @@ PushEngine::init() { localAddrRange = owner->getAddrRanges(); for (int i = 0; i < outPorts.size(); i++){ - portAddrMap[outPorts[i].id()] = outPorts[i].getAddrRanges(); + AddrRangeList range_list = outPorts[i].getAddrRanges(); + assert(range_list.size() 
== 1); + AddrRange range = outPorts[i].getAddrRanges().front(); + portAddrMap.insert(range, i); } } @@ -108,7 +116,8 @@ PushEngine::ReqPort::recvReqRetry() panic_if(blockedPacket == nullptr, "Received retry without a blockedPacket."); - DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + DPRINTF(PushEngine, "%s: ReqPort %d received a reqRetry. " + "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); PacketPtr pkt = blockedPacket; blockedPacket = nullptr; sendPacket(pkt); @@ -145,7 +154,7 @@ PushEngine::done() { bool empty_update_queues = true; for (int i = 0; i < outPorts.size(); i++) { - empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + empty_update_queues &= destinationQueues[i].empty(); } return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); @@ -273,8 +282,6 @@ PushEngine::handleMemResp(PacketPtr pkt) // TODO: in case we need to edit edges, get rid of second statement. assert(pkt->isResponse() && (!pkt->isWrite())); - // uint8_t* pkt_data = new uint8_t [peerMemoryAtomSize]; - // TODO: Change above line to below line. 
uint8_t pkt_data [peerMemoryAtomSize]; PushInfo push_info = reqInfoMap[pkt->req]; pkt->writeDataToBlock(pkt_data, peerMemoryAtomSize); @@ -293,7 +300,7 @@ PushEngine::handleMemResp(PacketPtr pkt) onTheFlyMemReqs -= push_info.numElements; reqInfoMap.erase(pkt->req); - // delete [] pkt_data; + delete pkt; if (!nextPropagateEvent.scheduled()) { @@ -316,10 +323,9 @@ PushEngine::processNextPropagateEvent() uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); - Update update(meta_edge.src, meta_edge.dst, update_value); metaEdgeQueue.pop_front(); - if (enqueueUpdate(update)) { + if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); stats.numPropagates++; @@ -348,61 +354,54 @@ PushEngine::processNextPropagateEvent() } bool -PushEngine::enqueueUpdate(Update update) +PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) { - Addr dst_addr = update.dst; - bool found_coalescing = false; - bool found_locally = false; - bool accepted = false; - for (auto range : localAddrRange) { - found_locally |= range.contains(dst_addr); - } - DPRINTF(PushEngine, "%s: Received update: %s.\n", __func__, update.to_string()); - for (int i = 0; i < outPorts.size(); i++) { - AddrRangeList addr_range_list = portAddrMap[outPorts[i].id()]; - if (contains(addr_range_list, dst_addr)) { - DPRINTF(PushEngine, "%s: Update: %s belongs to port %d.\n", - __func__, update.to_string(), outPorts[i].id()); - DPRINTF(PushEngine, "%s: There are %d updates already " + Addr aligned_dst = roundDown(dst, owner->vertexAtomSize()); + AddrRange update_range(aligned_dst, aligned_dst + owner->vertexAtomSize()); + auto entry = portAddrMap.contains(update_range); + PortID port_id = entry->second; + + DPRINTF(PushEngine, "%s: Update{src: %lu, dst:%lu, value: %u} " + "belongs to port %d.\n", + __func__, src, dst, value, port_id); + DPRINTF(PushEngine, "%s: There are %d updates already " "in 
queue for port %d.\n", __func__, - updateQueues[outPorts[i].id()].size(), - outPorts[i].id()); - for (auto& entry: updateQueues[outPorts[i].id()]) { - Update& curr_update = std::get<0>(entry); - if (curr_update.dst == update.dst) { - uint32_t old_value = curr_update.value; - curr_update.value = graphWorkload->reduce(old_value, update.value); - DPRINTF(PushEngine, "%s: found a coalescing opportunity " - "for destination %d with new value: %d by " - "coalescing %d and %d. \n", __func__, update.dst, - curr_update.value, old_value, update.value); - found_coalescing = true; - accepted = true; - stats.updateQueueCoalescions++; - } - } - if ((found_coalescing == false) && - (updateQueues[outPorts[i].id()].size() < updateQueueSize)) { - DPRINTF(PushEngine, "%s: There is a free entry available " - "in queue %d.\n", __func__, outPorts[i].id()); - updateQueues[outPorts[i].id()].emplace_back(update, curTick()); - DPRINTF(PushEngine, "%s: Emplaced the update at the back " - "of queue for port %d is. 
Size of queue " - "for port %d is %d.\n", __func__, - outPorts[i].id(), outPorts[i].id(), - updateQueues[outPorts[i].id()].size()); - accepted = true; - stats.updateQueueLength.sample( - updateQueues[outPorts[i].id()].size()); - } + destinationQueues[port_id].size(), port_id); + + assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size()); + + if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) { + DPRINTF(PushEngine, "%s: Found an existing update " + "for dst: %lu.\n", __func__, dst); + Addr prev_src; + uint32_t prev_val; + std::tie(prev_src, prev_val) = sourceAndValueMaps[port_id][dst]; + uint32_t new_val = graphWorkload->reduce(value, prev_val); + sourceAndValueMaps[port_id][dst] = std::make_tuple(prev_src, new_val); + DPRINTF(PushEngine, "%s: Coalesced Update{src: %lu, dst:%lu, value: %u} " + "with Update{src: %lu, dst:%lu, value: %u} to" + "Update{src: %lu, dst:%lu, value: %u}.\n", __func__, + src, dst, value, prev_src, dst, prev_val, + prev_src, dst, new_val); + stats.updateQueueCoalescions++; + return true; + } else if (destinationQueues[port_id].size() < updateQueueSize) { + DPRINTF(PushEngine, "%s: There is a free entry available " + "in queue for port %d.\n", __func__, port_id); + destinationQueues[port_id].emplace_back(dst, curTick()); + sourceAndValueMaps[port_id][dst] = std::make_tuple(src, value); + DPRINTF(PushEngine, "%s: Emplaced Update{src: %lu, dst:%lu, value: %u} " + "at the back of queue for port %d. 
" + "Size of queue for port %d is %d.\n", __func__, + src, dst, value, port_id, port_id, + destinationQueues[port_id].size()); + stats.updateQueueLength.sample(destinationQueues[port_id].size()); + if (!nextUpdatePushEvent.scheduled()) { + schedule(nextUpdatePushEvent, nextCycle()); } + return true; } - - if (accepted && (!nextUpdatePushEvent.scheduled())) { - schedule(nextUpdatePushEvent, nextCycle()); - } - - return accepted; + return false; } template PacketPtr @@ -429,30 +428,30 @@ PushEngine::processNextUpdatePushEvent() for (int i = 0; i < outPorts.size(); i++) { if (outPorts[i].blocked()) { - DPRINTF(PushEngine, "%s: Port %d blocked.\n", - __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d blocked.\n", __func__, i); continue; } - DPRINTF(PushEngine, "%s: Port %d available.\n", - __func__, outPorts[i].id()); - if (updateQueues[outPorts[i].id()].empty()) { - DPRINTF(PushEngine, "%s: Respective queue for port " - "%d is empty.\n", __func__, outPorts[i].id()); + DPRINTF(PushEngine, "%s: Port %d available.\n", __func__, i); + if (destinationQueues[i].empty()) { + DPRINTF(PushEngine, "%s: Respective queue for " + "port %d is empty.\n", __func__, i); continue; } - DPRINTF(PushEngine, "%s: Respective queue for port " - "%d not empty.\n", __func__, outPorts[i].id()); - Update update; + Addr dst; Tick entrance_tick; - std::tie(update, entrance_tick) = updateQueues[outPorts[i].id()].front(); - PacketPtr pkt = createUpdatePacket(update.dst, update.value); + std::tie(dst, entrance_tick) = destinationQueues[i].front(); + Addr src; + uint32_t value; + std::tie(src, value) = sourceAndValueMaps[i][dst]; + + PacketPtr pkt = createUpdatePacket(dst, value); outPorts[i].sendPacket(pkt); - DPRINTF(PushEngine, "%s: Sent update: %s to port %d. 
" - "Respective queue size is %d.\n", __func__, - update.to_string(), outPorts[i].id(), - updateQueues[outPorts[i].id()].size()); - updateQueues[outPorts[i].id()].pop_front(); - if (updateQueues[outPorts[i].id()].size() > 0) { + destinationQueues[i].pop_front(); + sourceAndValueMaps[i].erase(dst); + DPRINTF(PushEngine, "%s: Sent Update{src: %lu, dst:%lu, value: %u} to " + "port %d. Respective queue size is %d.\n", __func__, + src, dst, value, i, destinationQueues[i].size()); + if (destinationQueues[i].size() > 0) { next_time_send += 1; } stats.numUpdates++; diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index f51865acb3..9f489455ac 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -29,10 +29,14 @@ #ifndef __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_PUSH_ENGINE_HH__ +#include +#include + #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/sega/enums.hh" +#include "base/addr_range_map.hh" #include "base/intmath.hh" #include "params/PushEngine.hh" @@ -58,7 +62,6 @@ class PushEngine : public BaseMemoryEngine {} void sendPacket(PacketPtr pkt); bool blocked() { return (blockedPacket != nullptr); } - PortID id() { return _id; } protected: virtual bool recvTimingResp(PacketPtr pkt); @@ -110,12 +113,14 @@ class PushEngine : public BaseMemoryEngine bool done() { return (_start >= _end); } }; + struct PushInfo { Addr src; uint32_t value; Addr offset; int numElements; }; + MPU* owner; GraphWorkload* graphWorkload; @@ -136,9 +141,10 @@ class PushEngine : public BaseMemoryEngine int updateQueueSize; template PacketPtr createUpdatePacket(Addr addr, T value); - bool enqueueUpdate(Update update); - std::unordered_map portAddrMap; - std::unordered_map>> updateQueues; + bool enqueueUpdate(Addr src, Addr dst, uint32_t value); + std::vector>> destinationQueues; + std::vector>> 
sourceAndValueMaps; + AddrRangeMap portAddrMap; std::vector outPorts; bool vertexSpace(); From c06d62d3f5f53024667cdc15bd04b694e96bbef6 Mon Sep 17 00:00:00 2001 From: Ayaz Akram Date: Thu, 10 Nov 2022 13:26:44 -0800 Subject: [PATCH 248/287] mem: HBMCtrl changes to allow PC data buses to be in different states This change updates the HBMCtrl such that both pseudo channels can be in separate states (read or write) at the same time. In addition, the controller queues are now always split in two halves for both pseudo channels. Change-Id: Ifb599e611ad99f6c511baaf245bad2b5c9210a86 --- src/mem/HBMCtrl.py | 2 - src/mem/dram_interface.cc | 20 +++--- src/mem/hbm_ctrl.cc | 68 ++++++++------------- src/mem/hbm_ctrl.hh | 1 - src/mem/mem_ctrl.cc | 78 +++++++++++++----------- src/mem/mem_ctrl.hh | 6 +- src/mem/mem_interface.hh | 22 +++++++ src/mem/nvm_interface.cc | 10 +-- src/python/gem5/components/memory/hbm.py | 1 - 9 files changed, 108 insertions(+), 100 deletions(-) diff --git a/src/mem/HBMCtrl.py b/src/mem/HBMCtrl.py index f7355d4b67..f32ffe6f0a 100644 --- a/src/mem/HBMCtrl.py +++ b/src/mem/HBMCtrl.py @@ -48,5 +48,3 @@ class HBMCtrl(MemCtrl): # gives the best results with following min_r/w_per_switch min_reads_per_switch = 64 min_writes_per_switch = 64 - - partitioned_q = Param.Bool(False, "split queues for pseudo channels") diff --git a/src/mem/dram_interface.cc b/src/mem/dram_interface.cc index d745fe5a29..d8c6da0a2d 100644 --- a/src/mem/dram_interface.cc +++ b/src/mem/dram_interface.cc @@ -1068,13 +1068,14 @@ DRAMInterface::minBankPrep(const MemPacketQueue& queue, // latest Tick for which ACT can occur without // incurring additoinal delay on the data bus - const Tick tRCD = ctrl->inReadBusState(false) ? - tRCD_RD : tRCD_WR; + const Tick tRCD = ctrl->inReadBusState(false, + (MemInterface*)(this)) ? tRCD_RD : tRCD_WR; const Tick hidden_act_max = std::max(min_col_at - tRCD, curTick()); // When is the earliest the R/W burst can issue? 
- const Tick col_allowed_at = ctrl->inReadBusState(false) ? + const Tick col_allowed_at = ctrl->inReadBusState(false, + (MemInterface*)(this)) ? ranks[i]->banks[j].rdAllowedAt : ranks[i]->banks[j].wrAllowedAt; Tick col_at = std::max(col_allowed_at, act_at + tRCD); @@ -1180,10 +1181,10 @@ bool DRAMInterface::Rank::isQueueEmpty() const { // check commmands in Q based on current bus direction - bool no_queued_cmds = (dram.ctrl->inReadBusState(true) && - (readEntries == 0)) - || (dram.ctrl->inWriteBusState(true) && - (writeEntries == 0)); + bool no_queued_cmds = (dram.ctrl->inReadBusState(true, + (MemInterface*)(this)) && (readEntries == 0)) || + (dram.ctrl->inWriteBusState(true, + (MemInterface*)(this)) && (writeEntries == 0)); return no_queued_cmds; } @@ -1669,7 +1670,7 @@ DRAMInterface::Rank::processPowerEvent() // completed refresh event, ensure next request is scheduled if (!(dram.ctrl->requestEventScheduled(dram.pseudoChannel))) { DPRINTF(DRAM, "Scheduling next request after refreshing" - " rank %d\n", rank); + " rank %d, PC %d \n", rank, dram.pseudoChannel); dram.ctrl->restartScheduler(curTick(), dram.pseudoChannel); } } @@ -1831,7 +1832,8 @@ DRAMInterface::Rank::resetStats() { bool DRAMInterface::Rank::forceSelfRefreshExit() const { return (readEntries != 0) || - (dram.ctrl->inWriteBusState(true) && (writeEntries != 0)); + (dram.ctrl->inWriteBusState(true, (MemInterface*)(this)) + && (writeEntries != 0)); } void diff --git a/src/mem/hbm_ctrl.cc b/src/mem/hbm_ctrl.cc index efd46bbd54..e0d0922333 100644 --- a/src/mem/hbm_ctrl.cc +++ b/src/mem/hbm_ctrl.cc @@ -52,8 +52,7 @@ HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : name()), respondEventPC1([this] {processRespondEvent(pc1Int, respQueuePC1, respondEventPC1, retryRdReqPC1); }, name()), - pc1Int(p.dram_2), - partitionedQ(p.partitioned_q) + pc1Int(p.dram_2) { DPRINTF(MemCtrl, "Setting up HBM controller\n"); @@ -70,17 +69,8 @@ HBMCtrl::HBMCtrl(const HBMCtrlParams &p) : pc0Int->setCtrl(this, commandWindow, 0); 
pc1Int->setCtrl(this, commandWindow, 1); - if (partitionedQ) { - writeHighThreshold = (writeBufferSize * (p.write_high_thresh_perc/2) - / 100.0); - writeLowThreshold = (writeBufferSize * (p.write_low_thresh_perc/2) - / 100.0); - } else { - writeHighThreshold = (writeBufferSize * p.write_high_thresh_perc - / 100.0); - writeLowThreshold = (writeBufferSize * p.write_low_thresh_perc - / 100.0); - } + writeHighThreshold = (writeBufferSize/2 * p.write_high_thresh_perc)/100.0; + writeLowThreshold = (writeBufferSize/2 * p.write_low_thresh_perc)/100.0; } void @@ -156,9 +146,9 @@ HBMCtrl::writeQueueFullPC0(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Write queue limit %d, PC0 size %d, entries needed %d\n", - writeBufferSize, writeQueueSizePC0, neededEntries); + writeBufferSize/2, pc0Int->writeQueueSize, neededEntries); - unsigned int wrsize_new = (writeQueueSizePC0 + neededEntries); + unsigned int wrsize_new = (pc0Int->writeQueueSize + neededEntries); return wrsize_new > (writeBufferSize/2); } @@ -167,9 +157,9 @@ HBMCtrl::writeQueueFullPC1(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Write queue limit %d, PC1 size %d, entries needed %d\n", - writeBufferSize, writeQueueSizePC1, neededEntries); + writeBufferSize/2, pc1Int->writeQueueSize, neededEntries); - unsigned int wrsize_new = (writeQueueSizePC1 + neededEntries); + unsigned int wrsize_new = (pc1Int->writeQueueSize + neededEntries); return wrsize_new > (writeBufferSize/2); } @@ -178,10 +168,10 @@ HBMCtrl::readQueueFullPC0(unsigned int neededEntries) const { DPRINTF(MemCtrl, "Read queue limit %d, PC0 size %d, entries needed %d\n", - readBufferSize, readQueueSizePC0 + respQueue.size(), + readBufferSize/2, pc0Int->readQueueSize + respQueue.size(), neededEntries); - unsigned int rdsize_new = readQueueSizePC0 + respQueue.size() + unsigned int rdsize_new = pc0Int->readQueueSize + respQueue.size() + neededEntries; return rdsize_new > (readBufferSize/2); } @@ -191,26 +181,14 @@ HBMCtrl::readQueueFullPC1(unsigned 
int neededEntries) const { DPRINTF(MemCtrl, "Read queue limit %d, PC1 size %d, entries needed %d\n", - readBufferSize, readQueueSizePC1 + respQueuePC1.size(), + readBufferSize/2, pc1Int->readQueueSize + respQueuePC1.size(), neededEntries); - unsigned int rdsize_new = readQueueSizePC1 + respQueuePC1.size() + unsigned int rdsize_new = pc1Int->readQueueSize + respQueuePC1.size() + neededEntries; return rdsize_new > (readBufferSize/2); } -bool -HBMCtrl::readQueueFull(unsigned int neededEntries) const -{ - DPRINTF(MemCtrl, - "HBMCtrl: Read queue limit %d, entries needed %d\n", - readBufferSize, neededEntries); - - unsigned int rdsize_new = totalReadQueueSize + respQueue.size() + - respQueuePC1.size() + neededEntries; - return rdsize_new > readBufferSize; -} - bool HBMCtrl::recvTimingReq(PacketPtr pkt) { @@ -255,9 +233,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) // check local buffers and do not accept if full if (pkt->isWrite()) { if (is_pc0) { - if (partitionedQ ? writeQueueFullPC0(pkt_count) : - writeQueueFull(pkt_count)) - { + if (writeQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Write queue full, not accepting\n"); // remember that we have to retry this port MemCtrl::retryWrReq = true; @@ -265,13 +241,15 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) return false; } else { addToWriteQueue(pkt, pkt_count, pc0Int); + if (!nextReqEvent.scheduled()) { + DPRINTF(MemCtrl, "Request scheduled immediately\n"); + schedule(nextReqEvent, curTick()); + } stats.writeReqs++; stats.bytesWrittenSys += size; } } else { - if (partitionedQ ? 
writeQueueFullPC1(pkt_count) : - writeQueueFull(pkt_count)) - { + if (writeQueueFullPC1(pkt_count)) { DPRINTF(MemCtrl, "Write queue full, not accepting\n"); // remember that we have to retry this port retryWrReqPC1 = true; @@ -279,6 +257,10 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) return false; } else { addToWriteQueue(pkt, pkt_count, pc1Int); + if (!nextReqEventPC1.scheduled()) { + DPRINTF(MemCtrl, "Request scheduled immediately\n"); + schedule(nextReqEventPC1, curTick()); + } stats.writeReqs++; stats.bytesWrittenSys += size; } @@ -289,11 +271,10 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) assert(size != 0); if (is_pc0) { - if (partitionedQ ? readQueueFullPC0(pkt_count) : - HBMCtrl::readQueueFull(pkt_count)) { + if (readQueueFullPC0(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port - retryRdReqPC1 = true; + MemCtrl::retryRdReq = true; stats.numRdRetry++; return false; } else { @@ -308,8 +289,7 @@ HBMCtrl::recvTimingReq(PacketPtr pkt) stats.bytesReadSys += size; } } else { - if (partitionedQ ? 
readQueueFullPC1(pkt_count) : - HBMCtrl::readQueueFull(pkt_count)) { + if (readQueueFullPC1(pkt_count)) { DPRINTF(MemCtrl, "Read queue full, not accepting\n"); // remember that we have to retry this port retryRdReqPC1 = true; diff --git a/src/mem/hbm_ctrl.hh b/src/mem/hbm_ctrl.hh index f204b8346f..58cbd57c3b 100644 --- a/src/mem/hbm_ctrl.hh +++ b/src/mem/hbm_ctrl.hh @@ -145,7 +145,6 @@ class HBMCtrl : public MemCtrl */ bool readQueueFullPC0(unsigned int pkt_count) const; bool readQueueFullPC1(unsigned int pkt_count) const; - bool readQueueFull(unsigned int pkt_count) const; /** * Check if the write queue partition of both pseudo diff --git a/src/mem/mem_ctrl.cc b/src/mem/mem_ctrl.cc index 3cbacef800..731ce7be39 100644 --- a/src/mem/mem_ctrl.cc +++ b/src/mem/mem_ctrl.cc @@ -72,7 +72,6 @@ MemCtrl::MemCtrl(const MemCtrlParams &p) : writeLowThreshold(writeBufferSize * p.write_low_thresh_perc / 100.0), minWritesPerSwitch(p.min_writes_per_switch), minReadsPerSwitch(p.min_reads_per_switch), - writesThisTime(0), readsThisTime(0), memSchedPolicy(p.mem_sched_policy), frontendLatency(p.static_frontend_latency), backendLatency(p.static_backend_latency), @@ -277,6 +276,8 @@ MemCtrl::addToReadQueue(PacketPtr pkt, logRequest(MemCtrl::READ, pkt->requestorId(), pkt->qosValue(), mem_pkt->addr, 1); + mem_intr->readQueueSize++; + // Update stats stats.avgRdQLen = totalReadQueueSize + respQueue.size(); } @@ -349,6 +350,8 @@ MemCtrl::addToWriteQueue(PacketPtr pkt, unsigned int pkt_count, logRequest(MemCtrl::WRITE, pkt->requestorId(), pkt->qosValue(), mem_pkt->addr, 1); + mem_intr->writeQueueSize++; + assert(totalWriteQueueSize == isInWriteQueue.size()); // Update stats @@ -575,6 +578,9 @@ MemCtrl::chooseNext(MemPacketQueue& queue, Tick extra_col_delay, // check if there is a packet going to a free rank for (auto i = queue.begin(); i != queue.end(); ++i) { MemPacket* mem_pkt = *i; + if (mem_pkt->pseudoChannel != mem_intr->pseudoChannel) { + continue; + } if (packetReady(mem_pkt, 
mem_intr)) { ret = i; break; @@ -761,28 +767,28 @@ MemCtrl::verifyMultiCmd(Tick cmd_tick, Tick max_cmds_per_burst, } bool -MemCtrl::inReadBusState(bool next_state) const +MemCtrl::inReadBusState(bool next_state, MemInterface* mem_intr) const { // check the bus state if (next_state) { // use busStateNext to get the state that will be used // for the next burst - return (busStateNext == MemCtrl::READ); + return (mem_intr->busStateNext == MemCtrl::READ); } else { - return (busState == MemCtrl::READ); + return (mem_intr->busState == MemCtrl::READ); } } bool -MemCtrl::inWriteBusState(bool next_state) const +MemCtrl::inWriteBusState(bool next_state, MemInterface* mem_intr) const { // check the bus state if (next_state) { // use busStateNext to get the state that will be used // for the next burst - return (busStateNext == MemCtrl::WRITE); + return (mem_intr->busStateNext == MemCtrl::WRITE); } else { - return (busState == MemCtrl::WRITE); + return (mem_intr->busState == MemCtrl::WRITE); } } @@ -813,13 +819,13 @@ MemCtrl::doBurstAccess(MemPacket* mem_pkt, MemInterface* mem_intr) // Update the common bus stats if (mem_pkt->isRead()) { - ++readsThisTime; + ++(mem_intr->readsThisTime); // Update latency stats stats.requestorReadTotalLat[mem_pkt->requestorId()] += mem_pkt->readyTime - mem_pkt->entryTime; stats.requestorReadBytes[mem_pkt->requestorId()] += mem_pkt->size; } else { - ++writesThisTime; + ++(mem_intr->writesThisTime); stats.requestorWriteBytes[mem_pkt->requestorId()] += mem_pkt->size; stats.requestorWriteTotalLat[mem_pkt->requestorId()] += mem_pkt->readyTime - mem_pkt->entryTime; @@ -836,8 +842,8 @@ MemCtrl::memBusy(MemInterface* mem_intr) { // Default to busy status and update based on interface specifics // Default state of unused interface is 'true' bool mem_busy = true; - bool all_writes_nvm = mem_intr->numWritesQueued == totalWriteQueueSize; - bool read_queue_empty = totalReadQueueSize == 0; + bool all_writes_nvm = mem_intr->numWritesQueued == 
mem_intr->writeQueueSize; + bool read_queue_empty = mem_intr->readQueueSize == 0; mem_busy = mem_intr->isBusy(read_queue_empty, all_writes_nvm); if (mem_busy) { // if all ranks are refreshing wait for them to finish @@ -884,32 +890,32 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, } // detect bus state change - bool switched_cmd_type = (busState != busStateNext); + bool switched_cmd_type = (mem_intr->busState != mem_intr->busStateNext); // record stats recordTurnaroundStats(); DPRINTF(MemCtrl, "QoS Turnarounds selected state %s %s\n", - (busState==MemCtrl::READ)?"READ":"WRITE", + (mem_intr->busState==MemCtrl::READ)?"READ":"WRITE", switched_cmd_type?"[turnaround triggered]":""); if (switched_cmd_type) { - if (busState == MemCtrl::READ) { + if (mem_intr->busState == MemCtrl::READ) { DPRINTF(MemCtrl, "Switching to writes after %d reads with %d reads " - "waiting\n", readsThisTime, totalReadQueueSize); - stats.rdPerTurnAround.sample(readsThisTime); - readsThisTime = 0; + "waiting\n", mem_intr->readsThisTime, mem_intr->readQueueSize); + stats.rdPerTurnAround.sample(mem_intr->readsThisTime); + mem_intr->readsThisTime = 0; } else { DPRINTF(MemCtrl, "Switching to reads after %d writes with %d writes " - "waiting\n", writesThisTime, totalWriteQueueSize); - stats.wrPerTurnAround.sample(writesThisTime); - writesThisTime = 0; + "waiting\n", mem_intr->writesThisTime, mem_intr->writeQueueSize); + stats.wrPerTurnAround.sample(mem_intr->writesThisTime); + mem_intr->writesThisTime = 0; } } // updates current state - busState = busStateNext; + mem_intr->busState = mem_intr->busStateNext; nonDetermReads(mem_intr); @@ -918,18 +924,18 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, } // when we get here it is either a read or a write - if (busState == READ) { + if (mem_intr->busState == READ) { // track if we should switch or not bool switch_to_writes = false; - if (totalReadQueueSize == 0) { + if (mem_intr->readQueueSize == 0) { // In the case there is no read request 
to go next, // trigger writes if we have passed the low threshold (or // if we are draining) - if (!(totalWriteQueueSize == 0) && + if (!(mem_intr->writeQueueSize == 0) && (drainState() == DrainState::Draining || - totalWriteQueueSize > writeLowThreshold)) { + mem_intr->writeQueueSize > writeLowThreshold)) { DPRINTF(MemCtrl, "Switching to writes due to read queue empty\n"); @@ -1004,6 +1010,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, mem_pkt->qosValue(), mem_pkt->getAddr(), 1, mem_pkt->readyTime - mem_pkt->entryTime); + mem_intr->readQueueSize--; // Insert into response queue. It will be sent back to the // requestor at its readyTime @@ -1022,8 +1029,8 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // there are no other writes that can issue // Also ensure that we've issued a minimum defined number // of reads before switching, or have emptied the readQ - if ((totalWriteQueueSize > writeHighThreshold) && - (readsThisTime >= minReadsPerSwitch || totalReadQueueSize == 0) + if ((mem_intr->writeQueueSize > writeHighThreshold) && + (mem_intr->readsThisTime >= minReadsPerSwitch || mem_intr->readQueueSize == 0) && !(nvmWriteBlock(mem_intr))) { switch_to_writes = true; } @@ -1038,7 +1045,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // draining), or because the writes hit the hight threshold if (switch_to_writes) { // transition to writing - busStateNext = WRITE; + mem_intr->busStateNext = WRITE; } } else { @@ -1092,6 +1099,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, mem_pkt->qosValue(), mem_pkt->getAddr(), 1, mem_pkt->readyTime - mem_pkt->entryTime); + mem_intr->writeQueueSize--; // remove the request from the queue - the iterator is no longer valid writeQueue[mem_pkt->qosValue()].erase(to_write); @@ -1105,15 +1113,15 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, // If we are interfacing to NVM and have filled the writeRespQueue, // with only NVM writes in Q, then switch to reads bool below_threshold = - 
totalWriteQueueSize + minWritesPerSwitch < writeLowThreshold; + mem_intr->writeQueueSize + minWritesPerSwitch < writeLowThreshold; - if (totalWriteQueueSize == 0 || + if (mem_intr->writeQueueSize == 0 || (below_threshold && drainState() != DrainState::Draining) || - (totalReadQueueSize && writesThisTime >= minWritesPerSwitch) || - (totalReadQueueSize && (nvmWriteBlock(mem_intr)))) { + (mem_intr->readQueueSize && mem_intr->writesThisTime >= minWritesPerSwitch) || + (mem_intr->readQueueSize && (nvmWriteBlock(mem_intr)))) { // turn the bus back around for reads again - busStateNext = MemCtrl::READ; + mem_intr->busStateNext = MemCtrl::READ; // note that the we switch back to reads also in the idle // case, which eventually will check for any draining and @@ -1126,7 +1134,7 @@ MemCtrl::processNextReqEvent(MemInterface* mem_intr, if (!next_req_event.scheduled()) schedule(next_req_event, std::max(mem_intr->nextReqTime, curTick())); - if (retry_wr_req && totalWriteQueueSize < writeBufferSize) { + if (retry_wr_req && mem_intr->writeQueueSize < writeBufferSize) { retry_wr_req = false; port.sendRetryReq(); } @@ -1400,7 +1408,7 @@ MemCtrl::drain() { // if there is anything in any of our internal queues, keep track // of that as well - if (!(!totalWriteQueueSize && !totalReadQueueSize && respQueue.empty() && + if (!(!totalWriteQueueSize && !totalReadQueueSize && respQEmpty() && allIntfDrained())) { DPRINTF(Drain, "Memory controller not drained, write: %d, read: %d," diff --git a/src/mem/mem_ctrl.hh b/src/mem/mem_ctrl.hh index fe5d478280..fffd05405e 100644 --- a/src/mem/mem_ctrl.hh +++ b/src/mem/mem_ctrl.hh @@ -515,8 +515,6 @@ class MemCtrl : public qos::MemCtrl uint32_t writeLowThreshold; const uint32_t minWritesPerSwitch; const uint32_t minReadsPerSwitch; - uint32_t writesThisTime; - uint32_t readsThisTime; /** * Memory controller configuration initialized based on parameter @@ -762,7 +760,7 @@ class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or 
next bus state * @return True when bus is currently in a read state */ - bool inReadBusState(bool next_state) const; + bool inReadBusState(bool next_state, MemInterface* mem_intr) const; /** * Check the current direction of the memory channel @@ -770,7 +768,7 @@ class MemCtrl : public qos::MemCtrl * @param next_state Check either the current or next bus state * @return True when bus is currently in a write state */ - bool inWriteBusState(bool next_state) const; + bool inWriteBusState(bool next_state, MemInterface* mem_intr) const; Port &getPort(const std::string &if_name, PortID idx=InvalidPortID) override; diff --git a/src/mem/mem_interface.hh b/src/mem/mem_interface.hh index 8d6f4fe52b..b0f762fc80 100644 --- a/src/mem/mem_interface.hh +++ b/src/mem/mem_interface.hh @@ -189,6 +189,28 @@ class MemInterface : public AbstractMemory Tick nextBurstAt = 0; Tick nextReqTime = 0; + /** + * Reads/writes performed by the controller for this interface before + * bus direction is switched + */ + uint32_t readsThisTime = 0; + uint32_t writesThisTime = 0; + + /** + * Read/write packets in the read/write queue for this interface + * qos/mem_ctrl.hh has similar counters, but they track all packets + * in the controller for all memory interfaces connected to the + * controller. 
+ */ + uint32_t readQueueSize = 0; + uint32_t writeQueueSize = 0; + + + MemCtrl::BusState busState = MemCtrl::READ; + + /** bus state for next request event triggered */ + MemCtrl::BusState busStateNext = MemCtrl::READ; + /** * pseudo channel number used for HBM modeling */ diff --git a/src/mem/nvm_interface.cc b/src/mem/nvm_interface.cc index b2c4073cd9..e77cf59202 100644 --- a/src/mem/nvm_interface.cc +++ b/src/mem/nvm_interface.cc @@ -402,9 +402,11 @@ NVMInterface::processReadReadyEvent() bool NVMInterface::burstReady(MemPacket* pkt) const { - bool read_rdy = pkt->isRead() && (ctrl->inReadBusState(true)) && - (pkt->readyTime <= curTick()) && (numReadDataReady > 0); - bool write_rdy = !pkt->isRead() && !ctrl->inReadBusState(true) && + bool read_rdy = pkt->isRead() && (ctrl->inReadBusState(true, + (MemInterface*)(this))) && + (pkt->readyTime <= curTick()) && (numReadDataReady > 0); + bool write_rdy = !pkt->isRead() && !ctrl->inReadBusState(true, + (MemInterface*)(this)) && !writeRespQueueFull(); return (read_rdy || write_rdy); } @@ -613,7 +615,7 @@ NVMInterface::isBusy(bool read_queue_empty, bool all_writes_nvm) // Only assert busy for the write case when there are also // no reads in Q and the write queue only contains NVM commands // This allows the bus state to switch and service reads - return (ctrl->inReadBusState(true) ? + return (ctrl->inReadBusState(true, (MemInterface*)(this)) ? 
(numReadDataReady == 0) && !read_queue_empty : writeRespQueueFull() && read_queue_empty && all_writes_nvm); diff --git a/src/python/gem5/components/memory/hbm.py b/src/python/gem5/components/memory/hbm.py index 35497c2f89..75db1f9fde 100644 --- a/src/python/gem5/components/memory/hbm.py +++ b/src/python/gem5/components/memory/hbm.py @@ -122,7 +122,6 @@ def _interleave_addresses(self): # for interleaving across pseudo channels (at 64B currently) mask_list.insert(0, 1 << 6) for i, ctrl in enumerate(self.mem_ctrl): - ctrl.partitioned_q = False ctrl.dram.range = AddrRange( start=self._mem_range.start, size=self._mem_range.size(), From 5c6a9ba95eadb86f0b23b8562ba6f37180d92fee Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Oct 2022 14:10:57 -0700 Subject: [PATCH 249/287] Fixing done, code style and conifg. Adding a stat. --- configs/accl/sega-simple.py | 165 ++++++++++++++++++++++++++++++++++++ 1 file changed, 165 insertions(+) create mode 100644 configs/accl/sega-simple.py diff --git a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py new file mode 100644 index 0000000000..54a90281bf --- /dev/null +++ b/configs/accl/sega-simple.py @@ -0,0 +1,165 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +import m5 +import argparse + +from math import log +from m5.objects import * + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append(AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i)) + return ret + +class GPT(SubSystem): + def __init__(self, edge_memory_size: str, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=128, + register_file_size=64 + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + num_mshr_entry=64, + num_tgts_per_mshr=64, + max_resp_per_cycle=8 + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=64, + update_queue_size=16, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="0GB/s" + ) + + self.edge_mem_ctrl = SimpleMemory( + latency="30ns", + latency_var="0ns", + bandwidth="32GB/s", + 
range=AddrRange(edge_memory_size), + in_addr_map=False + ) + + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + self.push_engine.mem_port = self.edge_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine + ) + + def getRespPort(self): + return self.wl_engine.in_ports + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + def setReqPort(self, port): + self.push_engine.out_ports = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + def set_edge_image(self, edge_image): + self.edge_mem_ctrl.image_file = edge_image + +class SEGA(System): + def __init__(self, num_mpus, cache_size, graph_path): + super(SEGA, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = '2GHz' + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") + + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), + num_mpus, + 32 + ) + + gpts = [] + for i in range(num_mpus): + gpt = GPT("8GiB", cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_edge_image(f"{graph_path}/edgelist_{i}") + gpts.append(gpt) + # Creating the interconnect among mpus + for gpt_0 in gpts: + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + self.gpts = gpts + + self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def create_initial_bfs_update(self, init_addr, init_value): + self.ctrl.createInitialBFSUpdate(init_addr, init_value) + +def get_inputs(): + argparser = argparse.ArgumentParser() + argparser.add_argument("num_gpts", type=int) + argparser.add_argument("cache_size", type=str) + argparser.add_argument("graph", type=str) + argparser.add_argument("init_addr", type=int) + argparser.add_argument("init_value", type=int) + + args = 
argparser.parse_args() + + return args.num_gpts, args.cache_size, \ + args.graph, args.init_addr, args.init_value + +if __name__ == "__m5_main__": + num_gpts, cache_size, graph, init_addr, init_value = get_inputs() + + system = SEGA(num_gpts, cache_size, graph) + root = Root(full_system = False, system = system) + + m5.instantiate() + + system.create_initial_bfs_update(init_addr, init_value) + + exit_event = m5.simulate() + print(f"Exited simulation at tick {m5.curTick()} " + \ + f"because {exit_event.getCause()}") From 70e27e9e12588481b3cfe8849899f6a61191cab9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 16 Jan 2023 12:29:36 -0800 Subject: [PATCH 250/287] Initial commit for router --- configs/accl/bfs.py | 15 +- configs/accl/sega_simple_pt2pt.py | 213 +++++++++++++++++++++ src/accl/graph/sega/RouterEngine.py | 56 ++++++ src/accl/graph/sega/SConscript | 2 + src/accl/graph/sega/router_engine.cc | 264 +++++++++++++++++++++++++++ src/accl/graph/sega/router_engine.hh | 188 +++++++++++++++++++ 6 files changed, 737 insertions(+), 1 deletion(-) create mode 100644 configs/accl/sega_simple_pt2pt.py create mode 100644 src/accl/graph/sega/RouterEngine.py create mode 100644 src/accl/graph/sega/router_engine.cc create mode 100644 src/accl/graph/sega/router_engine.hh diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 97f1b5dc21..0b63088bff 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -55,6 +55,14 @@ def get_inputs(): default=False, help="Use simple memory for vertex", ) + argparser.add_argument( + "--pt2pt", + dest="pt2pt", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -83,6 +91,7 @@ def get_inputs(): args.init_value, args.visited, args.simple, + args.pt2pt, args.sample, args.verify, ) @@ -98,12 +107,16 @@ def get_inputs(): init_value, visited, simple, + pt2pt, sample, verify, ) = get_inputs() if simple: - from sega_simple import SEGA + 
if pt2pt: + from sega_simple_pt2pt import SEGA + else: + from sega_simple import SEGA else: from sega import SEGA system = SEGA(num_gpts, num_registers, cache_size, graph) diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py new file mode 100644 index 0000000000..1ccda2d85b --- /dev/null +++ b/configs/accl/sega_simple_pt2pt.py @@ -0,0 +1,213 @@ +# Copyright (c) 2022 The Regents of the University of California +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ +from math import log +from m5.objects import * + + +def interleave_addresses(plain_range, num_channels, cache_line_size): + intlv_low_bit = log(cache_line_size, 2) + intlv_bits = log(num_channels, 2) + ret = [] + for i in range(num_channels): + ret.append( + AddrRange( + start=plain_range.start, + size=plain_range.size(), + intlvHighBit=intlv_low_bit + intlv_bits - 1, + xorHighBit=0, + intlvBits=intlv_bits, + intlvMatch=i, + ) + ) + return ret + + +class GPT(SubSystem): + def __init__(self, register_file_size: int, cache_size: str): + super().__init__() + self.wl_engine = WLEngine( + update_queue_size=64, register_file_size=register_file_size + ) + self.coalesce_engine = CoalesceEngine( + attached_memory_atom_size=32, + cache_size=cache_size, + max_resp_per_cycle=8, + pending_pull_limit=64, + active_buffer_size=80, + post_push_wb_queue_size=64, + ) + self.push_engine = PushEngine( + push_req_queue_size=32, + attached_memory_atom_size=64, + resp_queue_size=4096, + max_propagates_per_cycle=8, + update_queue_size=32, + ) + + self.vertex_mem_ctrl = SimpleMemory( + latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + ) + self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port + + self.mpu = MPU( + wl_engine=self.wl_engine, + coalesce_engine=self.coalesce_engine, + push_engine=self.push_engine, + ) + + def getRespPort(self): + return self.wl_engine.in_ports + + def setRespPort(self, port): + self.wl_engine.in_ports = port + + def getReqPort(self): + return self.push_engine.out_ports + + def setReqPort(self, port): + self.push_engine.out_ports = port + + def getEdgeMemPort(self): + return self.push_engine.mem_port + + def setEdgeMemPort(self, port): + self.push_engine.mem_port = port + + def set_vertex_range(self, vertex_range): + self.vertex_mem_ctrl.range = vertex_range + + +class EdgeMemory(SubSystem): + def __init__(self, size: str): + super(EdgeMemory, self).__init__() + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2.4GHz" + 
self.clk_domain.voltage_domain = VoltageDomain() + + self.mem_ctrl = MemCtrl( + dram=DDR4_2400_8x8(range=AddrRange(size), in_addr_map=False) + ) + self.xbar = NoncoherentXBar( + width=64, frontend_latency=1, forward_latency=1, response_latency=1 + ) + self.xbar.mem_side_ports = self.mem_ctrl.port + + def set_image(self, image): + self.mem_ctrl.dram.image_file = image + + def getPort(self): + return self.xbar.cpu_side_ports + + def setPort(self, port): + self.xbar.cpu_side_ports = port + + +class SEGA(System): + def __init__(self, num_gpts, num_registers, cache_size, graph_path): + super(SEGA, self).__init__() + # num_gpts should be an even power of 2 + assert num_gpts != 0 + assert num_gpts % 2 == 0 + assert (num_gpts & (num_gpts - 1)) == 0 + + self.clk_domain = SrcClockDomain() + self.clk_domain.clock = "2GHz" + self.clk_domain.voltage_domain = VoltageDomain() + self.cache_line_size = 32 + self.mem_mode = "timing" + + # Building the CenteralController + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) + # Building the EdgeMemories + edge_mem = [] + for i in range(int(num_gpts / 2)): + mem = EdgeMemory("16GiB") + mem.set_image(f"{graph_path}/edgelist_{i}") + edge_mem.append(mem) + self.edge_mem = edge_mem + # Building the GPTs + vertex_ranges = interleave_addresses( + AddrRange(start=0, size="4GiB"), num_gpts, 32 + ) + gpts = [] + routers = [] + for i in range(num_gpts): + gpt = GPT(num_registers, cache_size) + gpt.set_vertex_range(vertex_ranges[i]) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) + gpts.append(gpt) + routers.append(RouterEngine()) + self.routers = routers + print(len(gpts)) + # self.router = RouterEngine() + + # Creating the interconnect among mpus + for gpt_0 in gpts: + # gpt_0.setRespPort(self.router.gpt_req_side) + for gpt_1 in gpts: + gpt_0.setReqPort(gpt_1.getRespPort()) + for i in range(len(gpts)): + gpts[i].setRespPort(routers[i].gpt_req_side) + self.gpts = gpts + 
self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + + def work_count(self): + return self.ctrl.workCount() + + def set_async_mode(self): + self.ctrl.setAsyncMode() + + def set_bsp_mode(self): + self.ctrl.setBSPMode() + + def create_pop_count_directory(self, atoms_per_block): + self.ctrl.createPopCountDirectory(atoms_per_block) + + def create_bfs_workload(self, init_addr, init_value): + self.ctrl.createBFSWorkload(init_addr, init_value) + + def create_bfs_visited_workload(self, init_addr, init_value): + self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + + def create_sssp_workload(self, init_addr, init_value): + self.ctrl.createSSSPWorkload(init_addr, init_value) + + def create_cc_workload(self): + self.ctrl.createCCWorkload() + + def create_pr_workload(self, alpha): + self.ctrl.createPRWorkload(alpha) + + def create_bc_workload(self, init_addr, init_value): + self.ctrl.createBCWorkload(init_addr, init_value) + + def print_answer(self): + self.ctrl.printAnswerToHostSimout() diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py new file mode 100644 index 0000000000..d232d95923 --- /dev/null +++ b/src/accl/graph/sega/RouterEngine.py @@ -0,0 +1,56 @@ +# -*- coding: utf-8 -*- +# Copyright (c) 2017 Jason Lowe-Power +# All rights reserved. +# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer; +# redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in the +# documentation and/or other materials provided with the distribution; +# neither the name of the copyright holders nor the names of its +# contributors may be used to endorse or promote products derived from +# this software without specific prior written permission. 
+# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +from m5.params import * +from m5.proxy import * +from m5.objects.ClockedObject import ClockedObject + +class RouterEngine(ClockedObject): + type = "RouterEngine" + cxx_header = "accl/graph/sega/router_engine.hh" + cxx_class = "gem5::RouterEngine" + + # push_req_queue_size = Param.Int("Size of the queue to " + # "queue push requests.") + # # resp_queue_size should probably be + # # significantly bigger than push_req_queue_size + # resp_queue_size = Param.Int("Size of the response queue in the " + # "push engine where it stores the " + # "edges read from memory.") + + # max_propagates_per_cycle = Param.Int("Maximum number of propagates " + # "done per cycle.") + + # update_queue_size = Param.Int("Maximum number of entries " + # "for each update queue.") + + gpt_req_side = VectorRequestPort("Outgoing ports to local GPTs") + gpt_resp_side = VectorRequestPort("incoming ports from local GPTs") + + gpn_req_side = VectorRequestPort("Outgoing ports to remote GPNs") + gpn_resp_side = VectorRequestPort("incoming ports from local GPNs") + # remote_resp_side = VectorRsponsePort("Incoming ports from GPNs to router") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index 
b3e1a838fb..a7d9096cca 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -33,6 +33,7 @@ SimObject("CoalesceEngine.py", sim_objects=["CoalesceEngine"]) SimObject("MPU.py", sim_objects=["MPU"]) SimObject("PushEngine.py", sim_objects=["PushEngine"]) SimObject("WLEngine.py", sim_objects=["WLEngine"]) +SimObject("RouterEngine.py", sim_objects=["RouterEngine"]) Source("base_memory_engine.cc") Source("centeral_controller.cc") @@ -41,6 +42,7 @@ Source("enums.cc") Source("mpu.cc") Source("push_engine.cc") Source("wl_engine.cc") +Source("router_engine.cc") DebugFlag("BaseMemoryEngine") DebugFlag("CenteralController") diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc new file mode 100644 index 0000000000..cf9de36ff1 --- /dev/null +++ b/src/accl/graph/sega/router_engine.cc @@ -0,0 +1,264 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ + +#include "accl/graph/sega/router_engine.hh" + +namespace gem5 +{ +RouterEngine::RouterEngine(const Params ¶ms): + ClockedObject(params) +{ + + for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) { + gptReqPorts.emplace_back( + name() + ".gpt_req_side" + std::to_string(i), this, i); + } + + for (int i = 0; i < params.port_gpt_resp_side_connection_count; ++i) { + gptRespPorts.emplace_back( + name() + ".gpt_resp_side" + std::to_string(i), this, i); + } + + for (int i = 0; i < params.port_gpn_req_side_connection_count; ++i) { + gpnReqPorts.emplace_back( + name() + ".gpn_req_side" + std::to_string(i), this, i); + } + + for (int i = 0; i < params.port_gpn_resp_side_connection_count; ++i) { + gpnRespPorts.emplace_back( + name() + ".gpn_resp_side" + std::to_string(i), this, i); + } +} + +AddrRangeList +RouterEngine::RouterRespPort::getAddrRanges() const +{ + return owner->getGPNRanges(); +} + +AddrRangeList +RouterEngine::InternalRespPort::getAddrRanges() const +{ + return owner->getGPTRanges(); +} + +AddrRangeList +RouterEngine::getGPNRanges() +{ + AddrRangeList ret; + for (auto &gpnPort : gpnReqPorts){ + for (auto &addr_range : gpnPort.getAddrRanges()) { + ret.push_back(addr_range); + } + } + // for(auto i = routerAddrMap.begin(); i != routerAddrMap.end(); ++i) { + // ret.push_back(i->second); + // } + return ret; +} + +void +RouterEngine::init() +{ + for (int i = 0; i < gptReqPorts.size(); i++) { + 
gptAddrMap[gptReqPorts[i].id()] = gptReqPorts[i].getAddrRanges(); + for (auto &addrRange: gptReqPorts[i].getAddrRanges()) { + std::cout<< __func__<blocked()){ +// queue->sendReadRetry = true; +// return false; +// } + +// queue->push(pkt); +// if (!nextRemoteReqEvent.scheduled()){ +// schedule(nextReqEvent, nextCycle()); +// } +// return true; +// } + + +// void +// Router::processNextRemoteReqEvent(){ +// RequestQueue *queue = NULL; +// for (auto &it : remoteReqQueues) { +// if (!it.emptyRead()){ +// queue = ⁢ +// break; +// } +// } + +// if (queue == nullptr){ +// return; +// } + +// PacketPtr pkt; +// std::vector::iterator memPort; +// while (true){ + +// pkt = queue->readQueue.front(); +// AddrRange addr_range = pkt->getAddrRange(); + +// PortID localRespPortID = memPortMap.contains(addr_range)->second; + +// localRespPort = find_if(localRespPorts.begin(), localRespPorts.end(), +// [memPortId](LocalRespPort &obj) +// {return obj.portId() == localRespPortID;}); + +// if (!localRespPort->blocked()){ +// break; +// } +// else { + +// } +// } + +// DPRINTF(MemScheduler, "processNextReqEvent: " +// "Port not blocked! 
Sending the packet\n"); +// PortID cpuPortId = pick->cpuPortId; + +// memPort->sendPacket(pkt); +// pick->timesChecked++; + + + +// entryTimes.erase(pkt); +// pick->readQueue.pop(); + +// if (!nextReqEvent.scheduled()){ +// for (auto &queue : requestQueues){ +// if (!queue.emptyRead() || queue.serviceWrite()){ +// DPRINTF(MemScheduler, "processNextReqEvent: " +// "Scheduling nextReqEvent in processNextReqEvent\n"); +// schedule(nextReqEvent, nextCycle()); +// break; +// } +// } +// } + + +// if (pick->sendReadRetry && !pick->blocked(pkt->isRead() +// || pkt->isWrite())){ +// PortID cpuPortId = pick->cpuPortId; +// auto cpuPort = find_if(cpuPorts.begin(), cpuPorts.end(), +// [cpuPortId](CPUSidePort &obj) +// {return obj.portId() == cpuPortId;}); +// DPRINTF(MemScheduler, "processNextReqEvent: " +// "Sending read retry to ports previously blocked\n"); +// cpuPort->trySendRetry(); +// pick->sendReadRetry = false; +// } + +// return; +// } + +// void Router::recvRangeChange(PortID portId) +// { +// for (auto &port : localRespPorts){ +// if (port.portId() == portId){ +// AddrRangeList ranges = port.getAddrRanges(); +// for (auto &r : ranges){ +// localPortMap.insert(r, portId); +// } +// } +// } +// sendRangeChange(); +// } + +}// namespace gem5 diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh new file mode 100644 index 0000000000..ebc464bb36 --- /dev/null +++ b/src/accl/graph/sega/router_engine.hh @@ -0,0 +1,188 @@ +/* + * Copyright (c) 2021 The Regents of the University of California. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are + * met: redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer; + * redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution; + * neither the name of the copyright holders nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ */ + +#ifndef __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ +#define __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ + +#include "params/RouterEngine.hh" +#include "sim/clocked_object.hh" +#include "mem/packet.hh" +#include "mem/port.hh" + +namespace gem5 +{ + +class RouterEngine : public ClockedObject +{ + private: + class RouterReqPort : public RequestPort + { + private: + RouterEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + RouterReqPort(const std::string& name, RouterEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + PortID id() { return _id; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class InternalReqPort : public RequestPort + { + private: + RouterEngine* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + InternalReqPort(const std::string& name, RouterEngine* owner, PortID id) : + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + PortID id() { return _id; } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + class RouterRespPort : public ResponsePort + { + private: + RouterEngine* owner; + bool needSendRetryReq; + PortID _id; + + public: + RouterRespPort(const std::string& name, RouterEngine* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + + class InternalRespPort : public RouterRespPort + { + private: + RouterEngine* owner; + bool 
needSendRetryReq; + PortID _id; + + public: + InternalRespPort(const std::string& name, RouterEngine* owner, PortID id): + ResponsePort(name, owner), + owner(owner), needSendRetryReq(false), _id(id) + {} + virtual AddrRangeList getAddrRanges() const; + + PortID id() { return _id; } + void checkRetryReq(); + + protected: + virtual bool recvTimingReq(PacketPtr pkt); + virtual Tick recvAtomic(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt); + virtual void recvRespRetry(); + }; + +// struct RequestQueue { +// std::queue reqQ; +// const uint32_t reqQSize; +// const PortID PortId; + +// bool blocked() { +// return reqQ.size() == reqQSize; +// } + +// void push(PacketPtr pkt) { +// reqQ.push(pkt); +// } + +// bool emptyQ(){ +// return reqQ.empty(); +// } + +// RequestQueue(uint32_t reqQSize, PortID portId): +// reqQSize(reqQSize), +// portId(portId) {} +// }; + +// std::vector remoteReqQueues; + + std::vector gptReqPorts; + std::vector gptRespPorts; + + std::vector gpnReqPorts; + std::vector gpnRespPorts; + + std::unordered_map gptAddrMap; + std::unordered_map routerAddrMap; + + + public: + PARAMS(RouterEngine); + RouterEngine(const Params ¶ms); + + virtual void init() override; + virtual void startup() override; + + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + + AddrRangeList getGPNRanges(); + AddrRangeList getGPTRanges(); + +// std::unordered_map> routerPortAddrMap; + +// AddrRangeMap localPortMap; + +}; + +} + +#endif // __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ From 8649642c5ed693ba9e678b05d2fb0e88525f0006 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 16 Jan 2023 17:48:53 -0800 Subject: [PATCH 251/287] Created a path from GPN to Router and remote router --- configs/accl/sega_simple_pt2pt.py | 4 + src/accl/graph/sega/RouterEngine.py | 6 +- src/accl/graph/sega/router_engine.cc | 206 +++++++++++++++++++++++---- src/accl/graph/sega/router_engine.hh | 60 ++++---- 4 files changed, 209 insertions(+), 67 deletions(-) 
diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index 1ccda2d85b..2c75ea4896 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ b/configs/accl/sega_simple_pt2pt.py @@ -176,6 +176,10 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): gpt_0.setReqPort(gpt_1.getRespPort()) for i in range(len(gpts)): gpts[i].setRespPort(routers[i].gpt_req_side) + for r_0 in routers: + for r_1 in routers: + if r_0 != r_1: + r_0.gpn_resp_side = r_1.gpn_req_side self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py index d232d95923..9fed906b83 100644 --- a/src/accl/graph/sega/RouterEngine.py +++ b/src/accl/graph/sega/RouterEngine.py @@ -49,8 +49,10 @@ class RouterEngine(ClockedObject): # "for each update queue.") gpt_req_side = VectorRequestPort("Outgoing ports to local GPTs") - gpt_resp_side = VectorRequestPort("incoming ports from local GPTs") + gpt_resp_side = VectorResponsePort("incoming ports from local GPTs") gpn_req_side = VectorRequestPort("Outgoing ports to remote GPNs") - gpn_resp_side = VectorRequestPort("incoming ports from local GPNs") + gpn_resp_side = VectorResponsePort("incoming ports from local GPNs") + gpt_queue_size = Param.Int(8, "Queue size on the gpt side") + gpn_queue_size = Param.Int(8, "Queue size on the gpt side") # remote_resp_side = VectorRsponsePort("Incoming ports from GPNs to router") diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index cf9de36ff1..c0866618b5 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -31,7 +31,11 @@ namespace gem5 { RouterEngine::RouterEngine(const Params ¶ms): - ClockedObject(params) + ClockedObject(params), + gptQSize(params.gpt_queue_size), + gpnQSize(params.gpn_queue_size), + nextInteralGPTGPNEvent([this] { processNextInteralGPTGPNEvent(); }, name()), + nextRemoteGPTGPNEvent([this] { 
processNextRemoteGPTGPNEvent(); }, name()) { for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) { @@ -56,13 +60,13 @@ RouterEngine::RouterEngine(const Params ¶ms): } AddrRangeList -RouterEngine::RouterRespPort::getAddrRanges() const +RouterEngine::GPTRespPort::getAddrRanges() const { return owner->getGPNRanges(); } AddrRangeList -RouterEngine::InternalRespPort::getAddrRanges() const +RouterEngine::GPNRespPort::getAddrRanges() const { return owner->getGPTRanges(); } @@ -71,7 +75,7 @@ AddrRangeList RouterEngine::getGPNRanges() { AddrRangeList ret; - for (auto &gpnPort : gpnReqPorts){ + for (auto &gpnPort : gpnReqPorts) { for (auto &addr_range : gpnPort.getAddrRanges()) { ret.push_back(addr_range); } @@ -82,13 +86,25 @@ RouterEngine::getGPNRanges() return ret; } +AddrRangeList +RouterEngine::getGPTRanges() +{ + AddrRangeList ret; + for (auto &gptPort : gptReqPorts) { + for (auto &addr_range : gptPort.getAddrRanges()) { + ret.push_back(addr_range); + } + } + return ret; +} + void RouterEngine::init() { for (int i = 0; i < gptReqPorts.size(); i++) { gptAddrMap[gptReqPorts[i].id()] = gptReqPorts[i].getAddrRanges(); for (auto &addrRange: gptReqPorts[i].getAddrRanges()) { - std::cout<< __func__<handleRequest(id(), pkt)) { + return false; + } + return true; +} + +bool +RouterEngine::handleRequest(PortID portId, PacketPtr pkt) +{ + auto queue = gptReqQueues[portId]; + bool accepted = false; + if (queue.size() < gptQSize) { + gptReqQueues[portId].push(pkt); + accepted = true; + } else { + accepted = false; + } + + if(accepted && (!nextInteralGPTGPNEvent.scheduled())) { + schedule(nextInteralGPTGPNEvent, nextCycle()); + } + + return accepted; +} + +void +RouterEngine::processNextInteralGPTGPNEvent() +{ + bool found = false; + int queues_none_empty = 0; + for (auto queue = gptReqQueues.begin(); + queue != gptReqQueues.end(); ++queue) { + if (!queue->second.empty()) { + PacketPtr pkt = queue->second.front(); + Addr pkt_addr = pkt->getAddr(); + queues_none_empty 
+= 1; + for (int i = 0; i < gpnReqPorts.size(); i++) { + AddrRangeList addr_list = routerAddrMap[gpnReqPorts[i].id()]; + if ((contains(addr_list, pkt_addr))) { + if (gpnRespQueues[gpnReqPorts[i].id()].size() < gpnQSize) { + gpnRespQueues[gpnReqPorts[i].id()].push(pkt); + queue->second.pop(); + found = true; + queues_none_empty -= 1; + if (!queue->second.empty()) { + queues_none_empty += 1; + } + if ((!nextRemoteGPTGPNEvent.scheduled())) { + schedule(nextRemoteGPTGPNEvent, nextCycle()); + } + break; + // queue is full + } else { + found = false; + break; + } + } + } + } + if (found) { + break; + } + } + + if (queues_none_empty > 0) { + schedule(nextInteralGPTGPNEvent, nextCycle()); + } +} + +void +RouterEngine::processNextRemoteGPTGPNEvent() +{ + uint32_t none_empty_queue = 0; + for (auto queue = gpnRespQueues.begin(); + queue != gpnRespQueues.end(); ++queue) { + if (!queue->second.empty()) { + if (!gpnReqPorts[queue->first].blocked()) { + PacketPtr pkt = queue->second.front(); + gpnReqPorts[queue->first].sendPacket(pkt); + queue->second.pop(); + break; + } + } + } + + for (auto queue = gpnRespQueues.begin(); + queue != gpnRespQueues.end(); ++queue) { + if (!queue->second.empty()) { + none_empty_queue += 1; + } + } + + if (none_empty_queue > 0) { + schedule(nextRemoteGPTGPNEvent, nextCycle()); + } +} + +void +RouterEngine::GPTRespPort::recvFunctional(PacketPtr pkt) { + panic("Not implemented yet!"); +} + +void +RouterEngine::GPTRespPort::recvRespRetry() { panic("Not implemented yet!"); } Tick -RouterEngine::RouterRespPort::recvAtomic(PacketPtr pkt){ +RouterEngine::GPNRespPort::recvAtomic(PacketPtr pkt) { panic("Not implemented yet!"); } bool -RouterEngine::RouterRespPort::recvTimingReq(PacketPtr pkt){ +RouterEngine::GPNRespPort::recvTimingReq(PacketPtr pkt) { panic("Not implemented yet!"); } void -RouterEngine::RouterRespPort::recvFunctional(PacketPtr pkt){ +RouterEngine::GPNRespPort::recvFunctional(PacketPtr pkt) { panic("Not implemented yet!"); } void 
-RouterEngine::RouterRespPort::recvRespRetry(){ +RouterEngine::GPNRespPort::recvRespRetry() { panic("Not implemented yet!"); } @@ -160,14 +306,14 @@ RouterEngine::RouterRespPort::recvRespRetry(){ // Router::handleRemoteRequest(PortID portId, PacketPtr pkt) // { // auto queue = find_if(remoteReqQueues.begin(), remoteReqQueues.end(), -// [portId](RequestQueue &obj){return obj.cpuPortId == portId;}); -// if (queue->blocked()){ +// [portId](RequestQueue &obj) {return obj.cpuPortId == portId;}); +// if (queue->blocked()) { // queue->sendReadRetry = true; // return false; // } // queue->push(pkt); -// if (!nextRemoteReqEvent.scheduled()){ +// if (!nextRemoteReqEvent.scheduled()) { // schedule(nextReqEvent, nextCycle()); // } // return true; @@ -175,22 +321,22 @@ RouterEngine::RouterRespPort::recvRespRetry(){ // void -// Router::processNextRemoteReqEvent(){ +// Router::processNextRemoteReqEvent() { // RequestQueue *queue = NULL; // for (auto &it : remoteReqQueues) { -// if (!it.emptyRead()){ +// if (!it.emptyRead()) { // queue = ⁢ // break; // } // } -// if (queue == nullptr){ +// if (queue == nullptr) { // return; // } // PacketPtr pkt; // std::vector::iterator memPort; -// while (true){ +// while (true) { // pkt = queue->readQueue.front(); // AddrRange addr_range = pkt->getAddrRange(); @@ -201,7 +347,7 @@ RouterEngine::RouterRespPort::recvRespRetry(){ // [memPortId](LocalRespPort &obj) // {return obj.portId() == localRespPortID;}); -// if (!localRespPort->blocked()){ +// if (!localRespPort->blocked()) { // break; // } // else { @@ -221,9 +367,9 @@ RouterEngine::RouterRespPort::recvRespRetry(){ // entryTimes.erase(pkt); // pick->readQueue.pop(); -// if (!nextReqEvent.scheduled()){ -// for (auto &queue : requestQueues){ -// if (!queue.emptyRead() || queue.serviceWrite()){ +// if (!nextReqEvent.scheduled()) { +// for (auto &queue : requestQueues) { +// if (!queue.emptyRead() || queue.serviceWrite()) { // DPRINTF(MemScheduler, "processNextReqEvent: " // "Scheduling 
nextReqEvent in processNextReqEvent\n"); // schedule(nextReqEvent, nextCycle()); @@ -234,7 +380,7 @@ RouterEngine::RouterRespPort::recvRespRetry(){ // if (pick->sendReadRetry && !pick->blocked(pkt->isRead() -// || pkt->isWrite())){ +// || pkt->isWrite())) { // PortID cpuPortId = pick->cpuPortId; // auto cpuPort = find_if(cpuPorts.begin(), cpuPorts.end(), // [cpuPortId](CPUSidePort &obj) @@ -250,10 +396,10 @@ RouterEngine::RouterRespPort::recvRespRetry(){ // void Router::recvRangeChange(PortID portId) // { -// for (auto &port : localRespPorts){ -// if (port.portId() == portId){ +// for (auto &port : localRespPorts) { +// if (port.portId() == portId) { // AddrRangeList ranges = port.getAddrRanges(); -// for (auto &r : ranges){ +// for (auto &r : ranges) { // localPortMap.insert(r, portId); // } // } diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index ebc464bb36..413dc21bc4 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -34,13 +34,15 @@ #include "mem/packet.hh" #include "mem/port.hh" +#include + namespace gem5 { class RouterEngine : public ClockedObject { private: - class RouterReqPort : public RequestPort + class GPTReqPort : public RequestPort { private: RouterEngine* owner; @@ -48,7 +50,7 @@ class RouterEngine : public ClockedObject PortID _id; public: - RouterReqPort(const std::string& name, RouterEngine* owner, PortID id) : + GPTReqPort(const std::string& name, RouterEngine* owner, PortID id) : RequestPort(name, owner), owner(owner), blockedPacket(nullptr), _id(id) {} @@ -61,7 +63,7 @@ class RouterEngine : public ClockedObject virtual void recvReqRetry(); }; - class InternalReqPort : public RequestPort + class GPNReqPort : public RequestPort { private: RouterEngine* owner; @@ -69,7 +71,7 @@ class RouterEngine : public ClockedObject PortID _id; public: - InternalReqPort(const std::string& name, RouterEngine* owner, PortID id) : + GPNReqPort(const std::string& name, 
RouterEngine* owner, PortID id) : RequestPort(name, owner), owner(owner), blockedPacket(nullptr), _id(id) {} @@ -82,7 +84,7 @@ class RouterEngine : public ClockedObject virtual void recvReqRetry(); }; - class RouterRespPort : public ResponsePort + class GPTRespPort : public ResponsePort { private: RouterEngine* owner; @@ -90,7 +92,7 @@ class RouterEngine : public ClockedObject PortID _id; public: - RouterRespPort(const std::string& name, RouterEngine* owner, PortID id): + GPTRespPort(const std::string& name, RouterEngine* owner, PortID id): ResponsePort(name, owner), owner(owner), needSendRetryReq(false), _id(id) {} @@ -106,7 +108,7 @@ class RouterEngine : public ClockedObject virtual void recvRespRetry(); }; - class InternalRespPort : public RouterRespPort + class GPNRespPort : public ResponsePort { private: RouterEngine* owner; @@ -114,7 +116,7 @@ class RouterEngine : public ClockedObject PortID _id; public: - InternalRespPort(const std::string& name, RouterEngine* owner, PortID id): + GPNRespPort(const std::string& name, RouterEngine* owner, PortID id): ResponsePort(name, owner), owner(owner), needSendRetryReq(false), _id(id) {} @@ -130,39 +132,27 @@ class RouterEngine : public ClockedObject virtual void recvRespRetry(); }; -// struct RequestQueue { -// std::queue reqQ; -// const uint32_t reqQSize; -// const PortID PortId; - -// bool blocked() { -// return reqQ.size() == reqQSize; -// } + bool handleRequest(PortID portId, PacketPtr pkt); + std::vector gptReqPorts; + std::vector gptRespPorts; -// void push(PacketPtr pkt) { -// reqQ.push(pkt); -// } + std::vector gpnReqPorts; + std::vector gpnRespPorts; -// bool emptyQ(){ -// return reqQ.empty(); -// } - -// RequestQueue(uint32_t reqQSize, PortID portId): -// reqQSize(reqQSize), -// portId(portId) {} -// }; + std::unordered_map gptAddrMap; + std::unordered_map routerAddrMap; -// std::vector remoteReqQueues; - - std::vector gptReqPorts; - std::vector gptRespPorts; + std::unordered_map> gptReqQueues; + 
std::unordered_map> gpnRespQueues; - std::vector gpnReqPorts; - std::vector gpnRespPorts; + const uint32_t gptQSize; + const uint32_t gpnQSize; - std::unordered_map gptAddrMap; - std::unordered_map routerAddrMap; + EventFunctionWrapper nextInteralGPTGPNEvent; + void processNextInteralGPTGPNEvent(); + EventFunctionWrapper nextRemoteGPTGPNEvent; + void processNextRemoteGPTGPNEvent(); public: PARAMS(RouterEngine); From f5797c67024d7f29fef7ba29373d79f53a6f5cf8 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Tue, 17 Jan 2023 12:29:46 -0800 Subject: [PATCH 252/287] mem: Creating router specific to SEGA Implemented the path between Router to Router to GPTs + retry Change-Id: I250dacafb617c447657158916897c54caf28dd4f --- src/accl/graph/sega/router_engine.cc | 348 ++++++++++++++++----------- src/accl/graph/sega/router_engine.hh | 14 ++ 2 files changed, 226 insertions(+), 136 deletions(-) diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index c0866618b5..f92a9b4fff 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -35,7 +35,9 @@ RouterEngine::RouterEngine(const Params ¶ms): gptQSize(params.gpt_queue_size), gpnQSize(params.gpn_queue_size), nextInteralGPTGPNEvent([this] { processNextInteralGPTGPNEvent(); }, name()), - nextRemoteGPTGPNEvent([this] { processNextRemoteGPTGPNEvent(); }, name()) + nextRemoteGPTGPNEvent([this] { processNextRemoteGPTGPNEvent(); }, name()), + nextInteralGPNGPTEvent([this] { processNextInteralGPNGPTEvent(); }, name()), + nextRemoteGPNGPTEvent([this] { processNextRemoteGPNGPTEvent(); }, name()) { for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) { @@ -143,11 +145,6 @@ RouterEngine::GPTReqPort::recvTimingResp(PacketPtr pkt) { return 0; } -void -RouterEngine::GPTReqPort::recvReqRetry() { - panic("Not implemented yet!"); -} - bool RouterEngine::GPNReqPort::recvTimingResp(PacketPtr pkt) { panic("Not implemented yet!"); @@ -156,7 +153,23 @@ 
RouterEngine::GPNReqPort::recvTimingResp(PacketPtr pkt) { void RouterEngine::GPNReqPort::recvReqRetry() { - panic("Not implemented yet!"); + // We should have a blocked packet if this function is called. + assert(blockedPacket != nullptr); + + sendPacket(blockedPacket); + blockedPacket = nullptr; + + owner->wakeUpInternal(); +} + +void +RouterEngine::GPTReqPort::recvReqRetry() { + assert(blockedPacket != nullptr); + + sendPacket(blockedPacket); + blockedPacket = nullptr; + + owner->wakeUpExternal(); } void @@ -168,14 +181,41 @@ RouterEngine::GPNReqPort::sendPacket(PacketPtr pkt) { } } +void +RouterEngine::GPTReqPort::sendPacket(PacketPtr pkt) { + panic_if(blocked(), "Should never try to send if blocked MemSide!"); + // If we can't send the packet across the port, store it for later. + if (!sendTimingReq(pkt)) { + blockedPacket = pkt; + } +} + Tick RouterEngine::GPTRespPort::recvAtomic(PacketPtr pkt) { panic("Not implemented yet!"); } -bool +void +RouterEngine::GPTRespPort::checkRetryReq() +{ + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +void +RouterEngine::checkRetryExternal() +{ + for (int i = 0; i < gptRespPorts.size(); i++) { + gptRespPorts[i].checkRetryReq(); + } +} + +bool RouterEngine::GPTRespPort::recvTimingReq(PacketPtr pkt) { if (!owner->handleRequest(id(), pkt)) { + needSendRetryReq = true; return false; } return true; @@ -205,10 +245,9 @@ RouterEngine::processNextInteralGPTGPNEvent() { bool found = false; int queues_none_empty = 0; - for (auto queue = gptReqQueues.begin(); - queue != gptReqQueues.end(); ++queue) { - if (!queue->second.empty()) { - PacketPtr pkt = queue->second.front(); + for (auto &queue: gptReqQueues) { + if (!queue.second.empty()) { + PacketPtr pkt = queue.second.front(); Addr pkt_addr = pkt->getAddr(); queues_none_empty += 1; for (int i = 0; i < gpnReqPorts.size(); i++) { @@ -216,10 +255,11 @@ RouterEngine::processNextInteralGPTGPNEvent() if ((contains(addr_list, pkt_addr))) { if 
(gpnRespQueues[gpnReqPorts[i].id()].size() < gpnQSize) { gpnRespQueues[gpnReqPorts[i].id()].push(pkt); - queue->second.pop(); + queue.second.pop(); + checkRetryExternal(); found = true; queues_none_empty -= 1; - if (!queue->second.empty()) { + if (!queue.second.empty()) { queues_none_empty += 1; } if ((!nextRemoteGPTGPNEvent.scheduled())) { @@ -247,27 +287,26 @@ RouterEngine::processNextInteralGPTGPNEvent() void RouterEngine::processNextRemoteGPTGPNEvent() { - uint32_t none_empty_queue = 0; - for (auto queue = gpnRespQueues.begin(); - queue != gpnRespQueues.end(); ++queue) { - if (!queue->second.empty()) { - if (!gpnReqPorts[queue->first].blocked()) { - PacketPtr pkt = queue->second.front(); - gpnReqPorts[queue->first].sendPacket(pkt); - queue->second.pop(); + bool none_empty_queue = false; + for (auto &queue: gpnRespQueues) { + if (!queue.second.empty()) { + if (!gpnReqPorts[queue.first].blocked()) { + PacketPtr pkt = queue.second.front(); + gpnReqPorts[queue.first].sendPacket(pkt); + queue.second.pop(); break; } } } - for (auto queue = gpnRespQueues.begin(); - queue != gpnRespQueues.end(); ++queue) { - if (!queue->second.empty()) { - none_empty_queue += 1; + for (auto &queue: gpnRespQueues) { + if (!queue.second.empty()) { + none_empty_queue = true; + break; } } - if (none_empty_queue > 0) { + if (none_empty_queue) { schedule(nextRemoteGPTGPNEvent, nextCycle()); } } @@ -287,124 +326,161 @@ RouterEngine::GPNRespPort::recvAtomic(PacketPtr pkt) { panic("Not implemented yet!"); } -bool +void +RouterEngine::GPNRespPort::checkRetryReq() +{ + if (needSendRetryReq) { + needSendRetryReq = false; + sendRetryReq(); + } +} + +void +RouterEngine::checkRetryInternal() +{ + for (int i = 0; i < gpnRespPorts.size(); i++) { + gpnRespPorts[i].checkRetryReq(); + } +} + +bool RouterEngine::GPNRespPort::recvTimingReq(PacketPtr pkt) { - panic("Not implemented yet!"); + if (!owner->handleRemoteRequest(id(), pkt)) { + needSendRetryReq = true; + return false; + } + return true; } -void 
-RouterEngine::GPNRespPort::recvFunctional(PacketPtr pkt) { +bool +RouterEngine::handleRemoteRequest(PortID id, PacketPtr pkt) { + // std::queue& queue = routerAddrMap[id]; + // for (auto &itr: routerAddrMap) { + // if (itr.first == id) { + + // } + // } + bool accepted = false; + if (gpnReqQueues[id].size() < gpnQSize) { + gpnReqQueues[id].push(pkt); + accepted = true; + } else { + accepted = false; + } + + if (accepted && (!nextInteralGPNGPTEvent.scheduled())) { + schedule(nextInteralGPNGPTEvent, nextCycle()); + } + + return accepted; +} + +void +RouterEngine::processNextInteralGPNGPTEvent() +{ + bool found = false; + bool queues_none_empty = false; + for (auto &queue: gpnReqQueues) { + if (!queue.second.empty()) { + PacketPtr pkt = queue.second.front(); + Addr pkt_addr = pkt->getAddr(); + for (int i = 0; i < gptReqPorts.size(); i++) { + AddrRangeList addr_list = gptAddrMap[gptReqPorts[i].id()]; + if ((contains(addr_list, pkt_addr))) { + if (gptRespQueues[gpnReqPorts[i].id()].size() < gptQSize) { + gptRespQueues[gpnReqPorts[i].id()].push(pkt); + queue.second.pop(); + checkRetryInternal(); + found = true; + if ((!nextRemoteGPNGPTEvent.scheduled())) { + schedule(nextRemoteGPNGPTEvent, nextCycle()); + } + break; + } else { + found = false; + break; + } + } + } + } + if (found) { + break; + } + } + for (auto &queue: gpnReqQueues) { + if (!queue.second.empty()) { + queues_none_empty = true; + } + } + + if (queues_none_empty && !nextInteralGPNGPTEvent.scheduled()) { + schedule(nextInteralGPNGPTEvent, nextCycle()); + } +} + +void +RouterEngine::processNextRemoteGPNGPTEvent() +{ + bool none_empty_queue = false; + for (auto &queue: gptRespQueues) { + if (!queue.second.empty()) { + if (!gptReqPorts[queue.first].blocked()) { + PacketPtr pkt = queue.second.front(); + gptReqPorts[queue.first].sendPacket(pkt); + queue.second.pop(); + break; + } + } + } + + for (auto &queue: gptRespQueues) { + if (!queue.second.empty()) { + none_empty_queue = true; + break; + } + } + + if 
(none_empty_queue) { + schedule(nextRemoteGPNGPTEvent, nextCycle()); + } +} + +void +RouterEngine::GPNRespPort::recvFunctional(PacketPtr pkt) +{ panic("Not implemented yet!"); } void -RouterEngine::GPNRespPort::recvRespRetry() { +RouterEngine::GPNRespPort::recvRespRetry() +{ panic("Not implemented yet!"); } -// bool -// Router::handleRemoteRequest(PortID portId, PacketPtr pkt) -// { -// auto queue = find_if(remoteReqQueues.begin(), remoteReqQueues.end(), -// [portId](RequestQueue &obj) {return obj.cpuPortId == portId;}); -// if (queue->blocked()) { -// queue->sendReadRetry = true; -// return false; -// } - -// queue->push(pkt); -// if (!nextRemoteReqEvent.scheduled()) { -// schedule(nextReqEvent, nextCycle()); -// } -// return true; -// } - - -// void -// Router::processNextRemoteReqEvent() { -// RequestQueue *queue = NULL; -// for (auto &it : remoteReqQueues) { -// if (!it.emptyRead()) { -// queue = ⁢ -// break; -// } -// } - -// if (queue == nullptr) { -// return; -// } - -// PacketPtr pkt; -// std::vector::iterator memPort; -// while (true) { - -// pkt = queue->readQueue.front(); -// AddrRange addr_range = pkt->getAddrRange(); - -// PortID localRespPortID = memPortMap.contains(addr_range)->second; - -// localRespPort = find_if(localRespPorts.begin(), localRespPorts.end(), -// [memPortId](LocalRespPort &obj) -// {return obj.portId() == localRespPortID;}); - -// if (!localRespPort->blocked()) { -// break; -// } -// else { - -// } -// } - -// DPRINTF(MemScheduler, "processNextReqEvent: " -// "Port not blocked! 
Sending the packet\n"); -// PortID cpuPortId = pick->cpuPortId; - -// memPort->sendPacket(pkt); -// pick->timesChecked++; - - - -// entryTimes.erase(pkt); -// pick->readQueue.pop(); - -// if (!nextReqEvent.scheduled()) { -// for (auto &queue : requestQueues) { -// if (!queue.emptyRead() || queue.serviceWrite()) { -// DPRINTF(MemScheduler, "processNextReqEvent: " -// "Scheduling nextReqEvent in processNextReqEvent\n"); -// schedule(nextReqEvent, nextCycle()); -// break; -// } -// } -// } - - -// if (pick->sendReadRetry && !pick->blocked(pkt->isRead() -// || pkt->isWrite())) { -// PortID cpuPortId = pick->cpuPortId; -// auto cpuPort = find_if(cpuPorts.begin(), cpuPorts.end(), -// [cpuPortId](CPUSidePort &obj) -// {return obj.portId() == cpuPortId;}); -// DPRINTF(MemScheduler, "processNextReqEvent: " -// "Sending read retry to ports previously blocked\n"); -// cpuPort->trySendRetry(); -// pick->sendReadRetry = false; -// } - -// return; -// } - -// void Router::recvRangeChange(PortID portId) -// { -// for (auto &port : localRespPorts) { -// if (port.portId() == portId) { -// AddrRangeList ranges = port.getAddrRanges(); -// for (auto &r : ranges) { -// localPortMap.insert(r, portId); -// } -// } -// } -// sendRangeChange(); -// } +void +RouterEngine::wakeUpExternal() +{ + if (!nextRemoteGPNGPTEvent.scheduled()) { + for (auto &queue: gptRespQueues) { + if (!queue.second.empty()) { + schedule(nextRemoteGPNGPTEvent, nextCycle()); + return; + } + } + } +} + +void +RouterEngine::wakeUpInternal() +{ + if (!nextRemoteGPTGPNEvent.scheduled()) { + for (auto &queue: gpnRespQueues) { + if (!queue.second.empty()) { + schedule(nextRemoteGPTGPNEvent, nextCycle()); + return; + } + } + } +} }// namespace gem5 diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index 413dc21bc4..5068e12276 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -133,6 +133,11 @@ class RouterEngine : public ClockedObject }; 
bool handleRequest(PortID portId, PacketPtr pkt); + bool handleRemoteRequest(PortID portId, PacketPtr pkt); + void wakeUpInternal(); + void wakeUpExternal(); + void checkRetryExternal(); + void checkRetryInternal(); std::vector gptReqPorts; std::vector gptRespPorts; @@ -145,6 +150,9 @@ class RouterEngine : public ClockedObject std::unordered_map> gptReqQueues; std::unordered_map> gpnRespQueues; + std::unordered_map> gptRespQueues; + std::unordered_map> gpnReqQueues; + const uint32_t gptQSize; const uint32_t gpnQSize; @@ -154,6 +162,12 @@ class RouterEngine : public ClockedObject EventFunctionWrapper nextRemoteGPTGPNEvent; void processNextRemoteGPTGPNEvent(); + EventFunctionWrapper nextInteralGPNGPTEvent; + void processNextInteralGPNGPTEvent(); + + EventFunctionWrapper nextRemoteGPNGPTEvent; + void processNextRemoteGPNGPTEvent(); + public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); From f793cfff09eaece9494d6503f8a9be98d668a50f Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 30 Jan 2023 17:04:43 -0800 Subject: [PATCH 253/287] accl: Fixing the router Retry mechanism and the config files are fixed. 
Tested with multiple GPTs/Router --- configs/accl/sega_simple.py | 1 + configs/accl/sega_simple_pt2pt.py | 40 ++- src/accl/graph/sega/CenteralController.py | 2 + src/accl/graph/sega/RouterEngine.py | 19 +- src/accl/graph/sega/SConscript | 3 +- src/accl/graph/sega/centeral_controller.cc | 9 + src/accl/graph/sega/centeral_controller.hh | 3 + src/accl/graph/sega/router_engine.cc | 325 ++++++++++++++------- src/accl/graph/sega/router_engine.hh | 45 +-- 9 files changed, 290 insertions(+), 157 deletions(-) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 2d36ec584d..ce7ad982e6 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -170,6 +170,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + self.ctrl.router_vector = [] def work_count(self): return self.ctrl.workCount() diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index 2c75ea4896..d646ded0b0 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ b/configs/accl/sega_simple_pt2pt.py @@ -140,6 +140,8 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.cache_line_size = 32 self.mem_mode = "timing" + GPTPerGPN = 2 + # Building the CenteralController self.ctrl = CenteralController( vertex_image_file=f"{graph_path}/vertices" @@ -147,7 +149,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("16GiB") + mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem @@ -164,24 +166,33 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.edge_mem[i % (int(num_gpts / 2))].getPort() ) gpts.append(gpt) - routers.append(RouterEngine()) + for i in range(int(num_gpts/GPTPerGPN)): + routers.append( + RouterEngine(gpn_queue_size = 64, 
gpt_queue_size = 64)) self.routers = routers - print(len(gpts)) - # self.router = RouterEngine() - - # Creating the interconnect among mpus - for gpt_0 in gpts: - # gpt_0.setRespPort(self.router.gpt_req_side) - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) + # for gpt_0 in gpts: + # for gpt_1 in gpts: + # gpt_0.setReqPort(gpt_1.getRespPort()) + print("gpt, gpt") + for i in range(len(gpts)): + for j in range(len(gpts)): + if (int(i / GPTPerGPN) == int(j / GPTPerGPN) ): + print(i, j) + gpts[i].setReqPort(gpts[j].getRespPort()) + print("gpt, Router") for i in range(len(gpts)): - gpts[i].setRespPort(routers[i].gpt_req_side) + for j in range(len(routers)): + if (int(i / GPTPerGPN) == j): + print(i, j) + gpts[i].setRespPort(routers[j].gpt_req_side) + gpts[i].setReqPort(routers[j].gpt_resp_side) for r_0 in routers: for r_1 in routers: if r_0 != r_1: r_0.gpn_resp_side = r_1.gpn_req_side self.gpts = gpts self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + self.ctrl.router_vector = [r for r in self.routers] def work_count(self): return self.ctrl.workCount() @@ -207,8 +218,11 @@ def create_sssp_workload(self, init_addr, init_value): def create_cc_workload(self): self.ctrl.createCCWorkload() - def create_pr_workload(self, alpha): - self.ctrl.createPRWorkload(alpha) + def create_async_pr_workload(self, alpha, threshold): + self.ctrl.createAsyncPRWorkload(alpha, threshold) + + def create_pr_workload(self, num_nodes, alpha): + self.ctrl.createPRWorkload(num_nodes, alpha) def create_bc_workload(self, init_addr, init_value): self.ctrl.createBCWorkload(init_addr, init_value) diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index c5f44c82e9..211b1a694b 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -41,6 +41,8 @@ class CenteralController(ClockedObject): mpu_vector = VectorParam.MPU("All mpus in the system.") + router_vector = VectorParam.RouterEngine("All 
Routers in the system.") + cxx_exports = [ PyBindMethod("setAsyncMode"), PyBindMethod("setBSPMode"), diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py index 9fed906b83..a07ae30bdd 100644 --- a/src/accl/graph/sega/RouterEngine.py +++ b/src/accl/graph/sega/RouterEngine.py @@ -34,25 +34,12 @@ class RouterEngine(ClockedObject): cxx_header = "accl/graph/sega/router_engine.hh" cxx_class = "gem5::RouterEngine" - # push_req_queue_size = Param.Int("Size of the queue to " - # "queue push requests.") - # # resp_queue_size should probably be - # # significantly bigger than push_req_queue_size - # resp_queue_size = Param.Int("Size of the response queue in the " - # "push engine where it stores the " - # "edges read from memory.") - - # max_propagates_per_cycle = Param.Int("Maximum number of propagates " - # "done per cycle.") - - # update_queue_size = Param.Int("Maximum number of entries " - # "for each update queue.") + system = Param.System(Parent.any, "System this Engine is a part of") gpt_req_side = VectorRequestPort("Outgoing ports to local GPTs") gpt_resp_side = VectorResponsePort("incoming ports from local GPTs") gpn_req_side = VectorRequestPort("Outgoing ports to remote GPNs") gpn_resp_side = VectorResponsePort("incoming ports from local GPNs") - gpt_queue_size = Param.Int(8, "Queue size on the gpt side") - gpn_queue_size = Param.Int(8, "Queue size on the gpt side") - # remote_resp_side = VectorRsponsePort("Incoming ports from GPNs to router") + gpt_queue_size = Param.Int(64, "Queue size on the gpt side") + gpn_queue_size = Param.Int(64, "Queue size on the gpt side") diff --git a/src/accl/graph/sega/SConscript b/src/accl/graph/sega/SConscript index a7d9096cca..e0a3f8d28f 100644 --- a/src/accl/graph/sega/SConscript +++ b/src/accl/graph/sega/SConscript @@ -52,6 +52,7 @@ DebugFlag("PushEngine") DebugFlag("SEGAStructureSize") DebugFlag("MSDebug") DebugFlag("WLEngine") +DebugFlag("RouterEngine") CompoundFlag("MPU", ["CoalesceEngine", 
"PushEngine", - "WLEngine", "BaseMemoryEngine"]) + "WLEngine", "BaseMemoryEngine", "RouterEngine"]) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0aee3b77ce..46c6133947 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -49,6 +49,11 @@ CenteralController::CenteralController(const Params& params): mpuVector.push_back(mpu); mpu->registerCenteralController(this); } + + for (auto router : params.router_vector) { + routerVector.push_back(router); + router->registerCenteralController(this); + } } void @@ -175,6 +180,10 @@ CenteralController::recvDoneSignal() done &= mpu->done(); } + for (auto router : routerVector) { + done &= router->done(); + } + if (done && mode == ProcessingMode::ASYNCHRONOUS) { exitSimLoopNow("no update left to process."); } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index cce9ac2725..e1f3f413b5 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -35,6 +35,7 @@ #include "accl/graph/base/graph_workload.hh" #include "accl/graph/sega/enums.hh" #include "accl/graph/sega/mpu.hh" +#include "accl/graph/sega/router_engine.hh" #include "base/addr_range.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" @@ -52,6 +53,8 @@ class CenteralController : public ClockedObject ProcessingMode mode; std::vector mpuVector; + std::vector routerVector; + std::unordered_map addrRangeListMap; PacketPtr createReadPacket(Addr addr, unsigned int size); diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index f92a9b4fff..2f9ea95631 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -28,17 +28,26 @@ #include "accl/graph/sega/router_engine.hh" +#include "accl/graph/sega/centeral_controller.hh" +#include "base/trace.hh" +#include 
"debug/RouterEngine.hh" + namespace gem5 { RouterEngine::RouterEngine(const Params ¶ms): ClockedObject(params), + system(params.system), gptQSize(params.gpt_queue_size), gpnQSize(params.gpn_queue_size), - nextInteralGPTGPNEvent([this] { processNextInteralGPTGPNEvent(); }, name()), - nextRemoteGPTGPNEvent([this] { processNextRemoteGPTGPNEvent(); }, name()), - nextInteralGPNGPTEvent([this] { processNextInteralGPNGPTEvent(); }, name()), - nextRemoteGPNGPTEvent([this] { processNextRemoteGPNGPTEvent(); }, name()) -{ + emptyQueues(false), + nextGPTGPNEvent([this] { processNextGPTGPNEvent(); }, name()), + nextInternalRequestEvent( + [this] { processNextInternalRequestEvent(); }, name()), + nextGPNGPTEvent([this] { processNextGPNGPTEvent(); }, name()), + nextExternalRequestEvent + ([this] { processNextExternalRequestEvent(); }, name()) +// nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()) + { for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) { gptReqPorts.emplace_back( @@ -61,6 +70,13 @@ RouterEngine::RouterEngine(const Params ¶ms): } } +void +RouterEngine::registerCenteralController( + CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + AddrRangeList RouterEngine::GPTRespPort::getAddrRanges() const { @@ -105,9 +121,6 @@ RouterEngine::init() { for (int i = 0; i < gptReqPorts.size(); i++) { gptAddrMap[gptReqPorts[i].id()] = gptReqPorts[i].getAddrRanges(); - for (auto &addrRange: gptReqPorts[i].getAddrRanges()) { - std::cout<< name()<<", "<<__func__<print()); + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + DPRINTF(RouterEngine, "%s: blockedPacket sent successfully.\n", + __func__); + owner->recvReqRetry(); + } +} + bool -RouterEngine::GPNReqPort::recvTimingResp(PacketPtr pkt) { +RouterEngine::GPNReqPort::recvTimingResp(PacketPtr pkt) +{ panic("Not implemented yet!"); return 0; } void -RouterEngine::GPNReqPort::recvReqRetry() { 
+RouterEngine::GPNReqPort::recvReqRetry() +{ // We should have a blocked packet if this function is called. assert(blockedPacket != nullptr); - - sendPacket(blockedPacket); + PacketPtr pkt = blockedPacket; blockedPacket = nullptr; - owner->wakeUpInternal(); -} + sendPacket(pkt); -void -RouterEngine::GPTReqPort::recvReqRetry() { - assert(blockedPacket != nullptr); - - sendPacket(blockedPacket); - blockedPacket = nullptr; - - owner->wakeUpExternal(); + owner->wakeUpInternal(); } void @@ -177,6 +229,7 @@ RouterEngine::GPNReqPort::sendPacket(PacketPtr pkt) { panic_if(blocked(), "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { + DPRINTF(RouterEngine, "%s: The GPNReq port is blocked.\n", __func__); blockedPacket = pkt; } } @@ -186,6 +239,7 @@ RouterEngine::GPTReqPort::sendPacket(PacketPtr pkt) { panic_if(blocked(), "Should never try to send if blocked MemSide!"); // If we can't send the packet across the port, store it for later. 
if (!sendTimingReq(pkt)) { + DPRINTF(RouterEngine, "%s: The GPTReq port is blocked.\n", __func__); blockedPacket = pkt; } } @@ -204,97 +258,124 @@ RouterEngine::GPTRespPort::checkRetryReq() } } -void -RouterEngine::checkRetryExternal() -{ - for (int i = 0; i < gptRespPorts.size(); i++) { - gptRespPorts[i].checkRetryReq(); - } -} - bool RouterEngine::GPTRespPort::recvTimingReq(PacketPtr pkt) { if (!owner->handleRequest(id(), pkt)) { + DPRINTF(RouterEngine, "%s: Router Rejected the packet %s.\n", + __func__, pkt->getAddr()); needSendRetryReq = true; return false; } return true; } +void +RouterEngine::recvReqRetry() +{ + DPRINTF(RouterEngine, "%s: Received a reqRetry.\n", __func__); + if (!nextExternalRequestEvent.scheduled()) { + schedule(nextExternalRequestEvent, nextCycle()); + } +} + bool RouterEngine::handleRequest(PortID portId, PacketPtr pkt) { - auto queue = gptReqQueues[portId]; + auto &queue = gptReqQueues[portId]; bool accepted = false; if (queue.size() < gptQSize) { + DPRINTF(RouterEngine, "%s: gptReqQueues[%lu] size is: %d.\n", + __func__, portId, queue.size()); gptReqQueues[portId].push(pkt); accepted = true; } else { + DPRINTF(RouterEngine, "%s: gptReqQueues[%lu] is full: %d.\n", + __func__, portId); accepted = false; } - if(accepted && (!nextInteralGPTGPNEvent.scheduled())) { - schedule(nextInteralGPTGPNEvent, nextCycle()); + if (accepted && (!nextGPTGPNEvent.scheduled())) { + schedule(nextGPTGPNEvent, nextCycle()); } - + DPRINTF(RouterEngine, "%s: GPT sent req to router: accepted: %d.\n", + __func__, accepted); return accepted; } void -RouterEngine::processNextInteralGPTGPNEvent() +RouterEngine::processNextGPTGPNEvent() { bool found = false; - int queues_none_empty = 0; + bool queues_none_empty = false; + DPRINTF(RouterEngine, "%s: Trying to send a request from GPT to GPN.\n", + __func__); for (auto &queue: gptReqQueues) { if (!queue.second.empty()) { PacketPtr pkt = queue.second.front(); Addr pkt_addr = pkt->getAddr(); - queues_none_empty += 1; for 
(int i = 0; i < gpnReqPorts.size(); i++) { AddrRangeList addr_list = routerAddrMap[gpnReqPorts[i].id()]; if ((contains(addr_list, pkt_addr))) { if (gpnRespQueues[gpnReqPorts[i].id()].size() < gpnQSize) { gpnRespQueues[gpnReqPorts[i].id()].push(pkt); + DPRINTF(RouterEngine, "%s: Pushing the pkt %s to " + "gpnRespQueue[%d]. gpnRespQueue size is: %d\n", + __func__, pkt->getAddr(), i, + gpnRespQueues[gpnReqPorts[i].id()].size()); queue.second.pop(); - checkRetryExternal(); - found = true; - queues_none_empty -= 1; - if (!queue.second.empty()) { - queues_none_empty += 1; + DPRINTF(RouterEngine, "%s: gptReqQueue size is: %d.\n", + __func__, queue.second.size()); + found |= true; + if ((!nextInternalRequestEvent.scheduled())) { + schedule(nextInternalRequestEvent, nextCycle()); } - if ((!nextRemoteGPTGPNEvent.scheduled())) { - schedule(nextRemoteGPTGPNEvent, nextCycle()); - } - break; // queue is full } else { - found = false; - break; + DPRINTF(RouterEngine, "%s: gpnRespQueue[%d] is full." + "\n", __func__, pkt->getAddr(), i); + found |= false; } } } } if (found) { - break; + checkGPTRetryReq(); + } + } + + for (auto &queue: gptReqQueues) + { + if (!queue.second.empty()) { + queues_none_empty = true; } } - if (queues_none_empty > 0) { - schedule(nextInteralGPTGPNEvent, nextCycle()); + if (queues_none_empty) { + DPRINTF(RouterEngine, "%s: The gptReqQueues is not empty.\n", + __func__); + } else { + DPRINTF(RouterEngine, "%s: The gptReqQueues is empty.\n", __func__); + } + + if (queues_none_empty && (!nextGPTGPNEvent.scheduled())) { + schedule(nextGPTGPNEvent, nextCycle()); } } void -RouterEngine::processNextRemoteGPTGPNEvent() +RouterEngine::processNextInternalRequestEvent() { + DPRINTF(RouterEngine, "%s: Sending a request between two routers.\n", + __func__); bool none_empty_queue = false; for (auto &queue: gpnRespQueues) { if (!queue.second.empty()) { if (!gpnReqPorts[queue.first].blocked()) { PacketPtr pkt = queue.second.front(); + DPRINTF(RouterEngine, "%s: Sending 
packet %s to router: %d.\n", + __func__, pkt->getAddr(), gpnReqPorts[queue.first].id()); gpnReqPorts[queue.first].sendPacket(pkt); queue.second.pop(); - break; } } } @@ -307,16 +388,23 @@ RouterEngine::processNextRemoteGPTGPNEvent() } if (none_empty_queue) { - schedule(nextRemoteGPTGPNEvent, nextCycle()); + DPRINTF(RouterEngine, "%s: The gpnRespQueues is not empty.\n", + __func__); + } else { + DPRINTF(RouterEngine, "%s: The gpnRespQueues is empty.\n", __func__); + } + + if (none_empty_queue && (!nextInternalRequestEvent.scheduled())) { + schedule(nextInternalRequestEvent, nextCycle()); } } -void +void RouterEngine::GPTRespPort::recvFunctional(PacketPtr pkt) { panic("Not implemented yet!"); } -void +void RouterEngine::GPTRespPort::recvRespRetry() { panic("Not implemented yet!"); } @@ -327,25 +415,18 @@ RouterEngine::GPNRespPort::recvAtomic(PacketPtr pkt) { } void -RouterEngine::GPNRespPort::checkRetryReq() -{ +RouterEngine::GPNRespPort::checkRetryReq() { if (needSendRetryReq) { needSendRetryReq = false; sendRetryReq(); } } -void -RouterEngine::checkRetryInternal() -{ - for (int i = 0; i < gpnRespPorts.size(); i++) { - gpnRespPorts[i].checkRetryReq(); - } -} - bool RouterEngine::GPNRespPort::recvTimingReq(PacketPtr pkt) { if (!owner->handleRemoteRequest(id(), pkt)) { + DPRINTF(RouterEngine, "%s: Router Rejected the packet %s.\n", + __func__, pkt->getAddr()); needSendRetryReq = true; return false; } @@ -354,12 +435,6 @@ RouterEngine::GPNRespPort::recvTimingReq(PacketPtr pkt) { bool RouterEngine::handleRemoteRequest(PortID id, PacketPtr pkt) { - // std::queue& queue = routerAddrMap[id]; - // for (auto &itr: routerAddrMap) { - // if (itr.first == id) { - - // } - // } bool accepted = false; if (gpnReqQueues[id].size() < gpnQSize) { gpnReqQueues[id].push(pkt); @@ -368,15 +443,17 @@ RouterEngine::handleRemoteRequest(PortID id, PacketPtr pkt) { accepted = false; } - if (accepted && (!nextInteralGPNGPTEvent.scheduled())) { - schedule(nextInteralGPNGPTEvent, nextCycle()); + 
if (accepted && (!nextGPNGPTEvent.scheduled())) { + schedule(nextGPNGPTEvent, nextCycle()); } + DPRINTF(RouterEngine, "%s: The remote packet: %s is accepted: %d.\n", + __func__, pkt->getAddr(), accepted); return accepted; } void -RouterEngine::processNextInteralGPNGPTEvent() +RouterEngine::processNextGPNGPTEvent() { bool found = false; bool queues_none_empty = false; @@ -387,53 +464,73 @@ RouterEngine::processNextInteralGPNGPTEvent() for (int i = 0; i < gptReqPorts.size(); i++) { AddrRangeList addr_list = gptAddrMap[gptReqPorts[i].id()]; if ((contains(addr_list, pkt_addr))) { - if (gptRespQueues[gpnReqPorts[i].id()].size() < gptQSize) { - gptRespQueues[gpnReqPorts[i].id()].push(pkt); + if (gptRespQueues[gptReqPorts[i].id()].size() < gptQSize) { + gptRespQueues[gptReqPorts[i].id()].push(pkt); + DPRINTF(RouterEngine, "%s: The size of " + "gptRespQueues[%d] is %d.\n", __func__, i, + gptRespQueues[gptReqPorts[i].id()].size()); + DPRINTF(RouterEngine, + "%s: Sending pkt %s to GPT %d.\n", + __func__, pkt->getAddr(), i); queue.second.pop(); - checkRetryInternal(); - found = true; - if ((!nextRemoteGPNGPTEvent.scheduled())) { - schedule(nextRemoteGPNGPTEvent, nextCycle()); + found |= true; + if ((!nextExternalRequestEvent.scheduled())) { + schedule(nextExternalRequestEvent, nextCycle()); } - break; } else { - found = false; - break; + DPRINTF(RouterEngine, + "%s: gptRespQueues[%d] is full.\n", + __func__, pkt->getAddr(), i); + found |= false; } } } } if (found) { - break; + checkGPNRetryReq(); } } + for (auto &queue: gpnReqQueues) { if (!queue.second.empty()) { queues_none_empty = true; } } - if (queues_none_empty && !nextInteralGPNGPTEvent.scheduled()) { - schedule(nextInteralGPNGPTEvent, nextCycle()); + if (queues_none_empty) { + DPRINTF(RouterEngine, "%s: gpnReqQueues is not empty.\n", __func__); + } else { + DPRINTF(RouterEngine, "%s: gpnReqQueues is empty.\n", __func__); + } + + if (queues_none_empty && (!nextGPNGPTEvent.scheduled())) { + schedule(nextGPNGPTEvent, 
nextCycle()); } } void -RouterEngine::processNextRemoteGPNGPTEvent() +RouterEngine::processNextExternalRequestEvent() { + DPRINTF(RouterEngine, "%s: Sending the request to the GPT.\n", __func__); bool none_empty_queue = false; for (auto &queue: gptRespQueues) { if (!queue.second.empty()) { if (!gptReqPorts[queue.first].blocked()) { PacketPtr pkt = queue.second.front(); + DPRINTF(RouterEngine, "%s: gptRespQueues[%d] is not empty. " + "the size is: %d.\n", __func__, + gptReqPorts[queue.first].id() ,queue.second.size()); + DPRINTF(RouterEngine, "%s: Sending packet %s to GPT: %d.\n", + __func__, pkt->getAddr(),gptReqPorts[queue.first].id()); gptReqPorts[queue.first].sendPacket(pkt); queue.second.pop(); - break; } } } for (auto &queue: gptRespQueues) { + DPRINTF(RouterEngine, "%s: gptRespQueues[%d] size is: %d.\n", __func__, + gptReqPorts[queue.first].id() ,queue.second.size()); if (!queue.second.empty()) { none_empty_queue = true; break; @@ -441,7 +538,16 @@ RouterEngine::processNextRemoteGPNGPTEvent() } if (none_empty_queue) { - schedule(nextRemoteGPNGPTEvent, nextCycle()); + DPRINTF(RouterEngine, "%s: The gptRespQueues is not empty.\n", + __func__); + } else { + DPRINTF(RouterEngine, "%s: The gptRespQueues is empty.\n", __func__); + } + + if (none_empty_queue) { + if (!nextExternalRequestEvent.scheduled()) { + schedule(nextExternalRequestEvent, nextCycle()); + } } } @@ -451,19 +557,19 @@ RouterEngine::GPNRespPort::recvFunctional(PacketPtr pkt) panic("Not implemented yet!"); } -void +void RouterEngine::GPNRespPort::recvRespRetry() { panic("Not implemented yet!"); } void -RouterEngine::wakeUpExternal() +RouterEngine::wakeUpInternal() { - if (!nextRemoteGPNGPTEvent.scheduled()) { - for (auto &queue: gptRespQueues) { + if ((!nextInternalRequestEvent.scheduled())) { + for (auto &queue: gpnRespQueues) { if (!queue.second.empty()) { - schedule(nextRemoteGPNGPTEvent, nextCycle()); + schedule(nextInternalRequestEvent, nextCycle()); return; } } @@ -471,15 +577,18 @@ 
RouterEngine::wakeUpExternal() } void -RouterEngine::wakeUpInternal() +RouterEngine::checkGPTRetryReq() { - if (!nextRemoteGPTGPNEvent.scheduled()) { - for (auto &queue: gpnRespQueues) { - if (!queue.second.empty()) { - schedule(nextRemoteGPTGPNEvent, nextCycle()); - return; - } - } + for (int i = 0; i < gptRespPorts.size(); i++) { + gptRespPorts[i].checkRetryReq(); + } +} + +void +RouterEngine::checkGPNRetryReq() +{ + for (int i = 0; i < gpnRespPorts.size(); i++) { + gpnRespPorts[i].checkRetryReq(); } } diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index 5068e12276..1cc1282d4a 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -29,16 +29,17 @@ #ifndef __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ -#include "params/RouterEngine.hh" -#include "sim/clocked_object.hh" +#include + #include "mem/packet.hh" #include "mem/port.hh" - -#include +#include "params/RouterEngine.hh" +#include "sim/clocked_object.hh" +#include "sim/system.hh" namespace gem5 { - +class CenteralController; class RouterEngine : public ClockedObject { private: @@ -132,6 +133,8 @@ class RouterEngine : public ClockedObject virtual void recvRespRetry(); }; + System* system; + CenteralController* centeralController; bool handleRequest(PortID portId, PacketPtr pkt); bool handleRemoteRequest(PortID portId, PacketPtr pkt); void wakeUpInternal(); @@ -155,35 +158,39 @@ class RouterEngine : public ClockedObject const uint32_t gptQSize; const uint32_t gpnQSize; + bool emptyQueues; + + EventFunctionWrapper nextGPTGPNEvent; + void processNextGPTGPNEvent(); - EventFunctionWrapper nextInteralGPTGPNEvent; - void processNextInteralGPTGPNEvent(); + EventFunctionWrapper nextInternalRequestEvent; + void processNextInternalRequestEvent(); - EventFunctionWrapper nextRemoteGPTGPNEvent; - void processNextRemoteGPTGPNEvent(); + EventFunctionWrapper nextGPNGPTEvent; + void processNextGPNGPTEvent(); - 
EventFunctionWrapper nextInteralGPNGPTEvent; - void processNextInteralGPNGPTEvent(); + EventFunctionWrapper nextExternalRequestEvent; + void processNextExternalRequestEvent(); - EventFunctionWrapper nextRemoteGPNGPTEvent; - void processNextRemoteGPNGPTEvent(); + // EventFunctionWrapper nextDoneSignalEvent; + // void processNextDoneSignalEvent(); public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); - + void registerCenteralController(CenteralController* centeral_controller); virtual void init() override; virtual void startup() override; - Port& getPort(const std::string& if_name, - PortID idx = InvalidPortID) override; + PortID idx = InvalidPortID) override; AddrRangeList getGPNRanges(); AddrRangeList getGPTRanges(); + void recvReqRetry(); -// std::unordered_map> routerPortAddrMap; - -// AddrRangeMap localPortMap; + void checkGPTRetryReq(); + void checkGPNRetryReq(); + bool done(); }; From 14901e2ebdab187295c5247f37faa4916a6217c5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 2 Feb 2023 10:35:24 -0800 Subject: [PATCH 254/287] accl: Adding statistics + router latency to the router --- configs/accl/async-pr.py | 2 +- configs/accl/pr.py | 2 +- configs/accl/sega_simple_pt2pt.py | 9 ++- src/accl/graph/sega/RouterEngine.py | 2 + src/accl/graph/sega/router_engine.cc | 91 ++++++++++++++++++++-- src/accl/graph/sega/router_engine.hh | 111 ++++++++++++++++++--------- 6 files changed, 170 insertions(+), 47 deletions(-) diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py index 0bfb6caeaa..600a04c665 100644 --- a/configs/accl/async-pr.py +++ b/configs/accl/async-pr.py @@ -93,7 +93,7 @@ def get_inputs(): ) = get_inputs() if simple: - from sega_simple import SEGA + from sega_simple_pt2pt import SEGA else: from sega import SEGA system = SEGA(num_gpts, num_registers, cache_size, graph) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index 569514eb82..af8669775b 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -101,7 +101,7 @@ def 
get_inputs(): print(f"error_threshold: {error_threshold}") if simple: - from sega_simple import SEGA + from sega_simple_pt2pt import SEGA else: from sega import SEGA system = SEGA(num_gpts, num_registers, cache_size, graph) diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index d646ded0b0..a236e5c45c 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ b/configs/accl/sega_simple_pt2pt.py @@ -140,7 +140,7 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.cache_line_size = 32 self.mem_mode = "timing" - GPTPerGPN = 2 + GPTPerGPN = 8 # Building the CenteralController self.ctrl = CenteralController( @@ -168,7 +168,10 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): gpts.append(gpt) for i in range(int(num_gpts/GPTPerGPN)): routers.append( - RouterEngine(gpn_queue_size = 64, gpt_queue_size = 64)) + RouterEngine( + gpn_queue_size = 16, + gpt_queue_size = 16, + router_latency = "1ns")) self.routers = routers # for gpt_0 in gpts: # for gpt_1 in gpts: @@ -220,6 +223,8 @@ def create_cc_workload(self): def create_async_pr_workload(self, alpha, threshold): self.ctrl.createAsyncPRWorkload(alpha, threshold) + def get_pr_error(self): + return self.ctrl.getPRError() def create_pr_workload(self, num_nodes, alpha): self.ctrl.createPRWorkload(num_nodes, alpha) diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py index a07ae30bdd..b169bcd120 100644 --- a/src/accl/graph/sega/RouterEngine.py +++ b/src/accl/graph/sega/RouterEngine.py @@ -43,3 +43,5 @@ class RouterEngine(ClockedObject): gpn_resp_side = VectorResponsePort("incoming ports from local GPNs") gpt_queue_size = Param.Int(64, "Queue size on the gpt side") gpn_queue_size = Param.Int(64, "Queue size on the gpt side") + router_latency = Param.Latency("5ns", "Router latency, " + "SerDes or E-O-E latencies can be added here") diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 
2f9ea95631..57ae90b565 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -30,6 +30,7 @@ #include "accl/graph/sega/centeral_controller.hh" #include "base/trace.hh" +#include "sim/stats.hh" #include "debug/RouterEngine.hh" namespace gem5 @@ -40,14 +41,16 @@ RouterEngine::RouterEngine(const Params ¶ms): gptQSize(params.gpt_queue_size), gpnQSize(params.gpn_queue_size), emptyQueues(false), + routerLatency(params.router_latency), nextGPTGPNEvent([this] { processNextGPTGPNEvent(); }, name()), nextInternalRequestEvent( [this] { processNextInternalRequestEvent(); }, name()), nextGPNGPTEvent([this] { processNextGPNGPTEvent(); }, name()), - nextExternalRequestEvent - ([this] { processNextExternalRequestEvent(); }, name()) -// nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()) - { + nextExternalRequestEvent( + [this] { processNextExternalRequestEvent(); }, name()), +// trafficStats(*this) + stats(*this) +{ for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) { gptReqPorts.emplace_back( @@ -68,6 +71,12 @@ RouterEngine::RouterEngine(const Params ¶ms): gpnRespPorts.emplace_back( name() + ".gpn_resp_side" + std::to_string(i), this, i); } + + // for(int i = 0; i < gpnReqPorts.size(); i++){ + // stats.internalTrafficCount.push_back(new statistics::Histogram()); + // stats.internalTrafficCount[i]->init(10); + // } + // statistics::registerDumpCallback([this]() { collateStats(); }); } void @@ -376,6 +385,11 @@ RouterEngine::processNextInternalRequestEvent() __func__, pkt->getAddr(), gpnReqPorts[queue.first].id()); gpnReqPorts[queue.first].sendPacket(pkt); queue.second.pop(); + stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; + } + else { + stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; + // trafficStats.m_internalTrafficCount[gpnReqPorts[queue.first].id()]->sample(queue.second.size()); } } } @@ -395,7 +409,7 @@ RouterEngine::processNextInternalRequestEvent() } if 
(none_empty_queue && (!nextInternalRequestEvent.scheduled())) { - schedule(nextInternalRequestEvent, nextCycle()); + schedule(nextInternalRequestEvent, curTick()+routerLatency); } } @@ -524,6 +538,11 @@ RouterEngine::processNextExternalRequestEvent() __func__, pkt->getAddr(),gptReqPorts[queue.first].id()); gptReqPorts[queue.first].sendPacket(pkt); queue.second.pop(); + stats.externalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; + } + else { + stats.externalBlockedTraffic[gpnReqPorts[queue.first].id()]++; + // trafficStats.m_internalTrafficCount[gpnReqPorts[queue.first].id()]->sample(queue.second.size()); } } } @@ -592,4 +611,66 @@ RouterEngine::checkGPNRetryReq() } } +// RouterEngine:: +// RouterEngineStats::RouterEngineStats(RouterEngine &_router) +// : statistics::Group(&_router), +// router(_router) +// { + +// for (int i = 0; i < router.gpnReqPorts.size(); i++) { +// m_internalTrafficCount.push_back(new statistics::Histogram(this)); +// m_internalTrafficCount[i] +// ->init(0) +// .flags(statistics::nozero); +// } +// } + +// void RouterEngine::resetStats() +// { +// for (int i = 0; i < gpnReqPorts.size(); i++) { +// trafficStats.m_internalTrafficCount[i]->reset(); +// } +// } + +// void +// RouterEngine::regStats() +// { +// ClockedObject::regStats(); +// } + +// void +// RouterEngine::collateStats() +// { +// for (uint32_t j = 0; j < gpnReqPorts.size(); ++j) { +// trafficStats +// .m_internalTrafficCount[j]; +// // ->add(getInternalTrafficCount(j)); +// } +// } + +RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) + : statistics::Group(&_router), + router(_router), + ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(), + "Number of packets blocked between routers."), + ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(), + "Number of external packets blocked."), + ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), + "Number of packet passed between routers."), + 
ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), + "Number of external packets passed.") +{ +} + +void +RouterEngine::RouterEngineStat::regStats() +{ + using namespace statistics; + + internalBlockedTraffic.init(router.gpnReqPorts.size()); + externalBlockedTraffic.init(router.gptReqPorts.size()); + internalAcceptedTraffic.init(router.gpnReqPorts.size()); + externalAcceptedTraffic.init(router.gptReqPorts.size()); +} + }// namespace gem5 diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index 1cc1282d4a..caf013befa 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -132,52 +132,79 @@ class RouterEngine : public ClockedObject virtual void recvFunctional(PacketPtr pkt); virtual void recvRespRetry(); }; +// struct RouterEngineStats : public statistics::Group +// { +// RouterEngineStats(RouterEngine &router); + +// RouterEngine &router; +// std::vector m_internalTrafficCount; +// }; + System* system; + CenteralController* centeralController; + bool handleRequest(PortID portId, PacketPtr pkt); + bool handleRemoteRequest(PortID portId, PacketPtr pkt); + void wakeUpInternal(); + void wakeUpExternal(); + void checkRetryExternal(); + void checkRetryInternal(); + std::vector gptReqPorts; + std::vector gptRespPorts; + + std::vector gpnReqPorts; + std::vector gpnRespPorts; + + std::unordered_map gptAddrMap; + std::unordered_map routerAddrMap; + + std::unordered_map> gptReqQueues; + std::unordered_map> gpnRespQueues; + + std::unordered_map> gptRespQueues; + std::unordered_map> gpnReqQueues; + + const uint32_t gptQSize; + const uint32_t gpnQSize; + bool emptyQueues; + const Tick routerLatency; + + EventFunctionWrapper nextGPTGPNEvent; + void processNextGPTGPNEvent(); + + EventFunctionWrapper nextInternalRequestEvent; + void processNextInternalRequestEvent(); + + EventFunctionWrapper nextGPNGPTEvent; + void processNextGPNGPTEvent(); + + EventFunctionWrapper 
nextExternalRequestEvent; + void processNextExternalRequestEvent(); + + struct RouterEngineStat : public statistics::Group + { + RouterEngineStat(RouterEngine &push); - System* system; - CenteralController* centeralController; - bool handleRequest(PortID portId, PacketPtr pkt); - bool handleRemoteRequest(PortID portId, PacketPtr pkt); - void wakeUpInternal(); - void wakeUpExternal(); - void checkRetryExternal(); - void checkRetryInternal(); - std::vector gptReqPorts; - std::vector gptRespPorts; - - std::vector gpnReqPorts; - std::vector gpnRespPorts; - - std::unordered_map gptAddrMap; - std::unordered_map routerAddrMap; - - std::unordered_map> gptReqQueues; - std::unordered_map> gpnRespQueues; - - std::unordered_map> gptRespQueues; - std::unordered_map> gpnReqQueues; - - const uint32_t gptQSize; - const uint32_t gpnQSize; - bool emptyQueues; - - EventFunctionWrapper nextGPTGPNEvent; - void processNextGPTGPNEvent(); - - EventFunctionWrapper nextInternalRequestEvent; - void processNextInternalRequestEvent(); + void regStats() override; - EventFunctionWrapper nextGPNGPTEvent; - void processNextGPNGPTEvent(); + RouterEngine &router; - EventFunctionWrapper nextExternalRequestEvent; - void processNextExternalRequestEvent(); + statistics::Vector internalBlockedTraffic; + statistics::Vector externalBlockedTraffic; + statistics::Vector internalAcceptedTraffic; + statistics::Vector externalAcceptedTraffic; + }; - // EventFunctionWrapper nextDoneSignalEvent; - // void processNextDoneSignalEvent(); + RouterEngineStat stats; public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); + // RouterEngineStats trafficStats; + // struct TrafficStats : public statistics::Group + // { + // TrafficStats(RouterEngine &router); + // std::vector internalTrafficCount; + // } stats; + void registerCenteralController(CenteralController* centeral_controller); virtual void init() override; virtual void startup() override; @@ -191,6 +218,14 @@ class RouterEngine : public ClockedObject void 
checkGPTRetryReq(); void checkGPNRetryReq(); bool done(); + // virtual void collateStats(); + // virtual void resetStats(); + // virtual void regStats(); + + // statistics::Histogram& getInternalTrafficCount(uint32_t t) + // { return *(stats.internalTrafficCount[t]); } + + }; From e1f6cdec8c33b441c0aecff59567d6d3820ffdfe Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Thu, 2 Feb 2023 14:54:59 -0800 Subject: [PATCH 255/287] accl: improving the router scheduling --- src/accl/graph/sega/RouterEngine.py | 2 +- src/accl/graph/sega/router_engine.cc | 63 +++++++++++++++++++++++++--- src/accl/graph/sega/router_engine.hh | 6 ++- 3 files changed, 63 insertions(+), 8 deletions(-) diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py index b169bcd120..8182e81720 100644 --- a/src/accl/graph/sega/RouterEngine.py +++ b/src/accl/graph/sega/RouterEngine.py @@ -43,5 +43,5 @@ class RouterEngine(ClockedObject): gpn_resp_side = VectorResponsePort("incoming ports from local GPNs") gpt_queue_size = Param.Int(64, "Queue size on the gpt side") gpn_queue_size = Param.Int(64, "Queue size on the gpt side") - router_latency = Param.Latency("5ns", "Router latency, " + router_latency = Param.Cycles(5, "Router latency, " "SerDes or E-O-E latencies can be added here") diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 57ae90b565..9a144afa04 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -72,6 +72,13 @@ RouterEngine::RouterEngine(const Params ¶ms): name() + ".gpn_resp_side" + std::to_string(i), this, i); } + for (int i = 0; i init(10); @@ -380,12 +387,18 @@ RouterEngine::processNextInternalRequestEvent() for (auto &queue: gpnRespQueues) { if (!queue.second.empty()) { if (!gpnReqPorts[queue.first].blocked()) { + if ((curCycle() - + internalLatency[gpnReqPorts[queue.first].id()]) + < routerLatency) { + continue; + } + 
stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; PacketPtr pkt = queue.second.front(); DPRINTF(RouterEngine, "%s: Sending packet %s to router: %d.\n", __func__, pkt->getAddr(), gpnReqPorts[queue.first].id()); gpnReqPorts[queue.first].sendPacket(pkt); queue.second.pop(); - stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; + internalLatency[gpnReqPorts[queue.first].id()] = curCycle(); } else { stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; @@ -393,7 +406,6 @@ RouterEngine::processNextInternalRequestEvent() } } } - for (auto &queue: gpnRespQueues) { if (!queue.second.empty()) { none_empty_queue = true; @@ -408,8 +420,25 @@ RouterEngine::processNextInternalRequestEvent() DPRINTF(RouterEngine, "%s: The gpnRespQueues is empty.\n", __func__); } + Tick next_schedule = nextCycle() + cyclesToTicks(routerLatency); + for (auto itr = internalLatency.begin(); + itr != internalLatency.end(); + itr++) + { + if (cyclesToTicks(itr->second + routerLatency) < next_schedule) { + if ((itr->second + routerLatency) < curCycle()) { + next_schedule = nextCycle(); + break; + } else { + next_schedule = std::min( + cyclesToTicks(itr->second + routerLatency), + next_schedule); + } + } + } + if (none_empty_queue && (!nextInternalRequestEvent.scheduled())) { - schedule(nextInternalRequestEvent, curTick()+routerLatency); + schedule(nextInternalRequestEvent, next_schedule); } } @@ -530,6 +559,12 @@ RouterEngine::processNextExternalRequestEvent() for (auto &queue: gptRespQueues) { if (!queue.second.empty()) { if (!gptReqPorts[queue.first].blocked()) { + if ((curCycle() - + externalLatency[gptReqPorts[queue.first].id()]) + < routerLatency) { + continue; + } + stats.externalAcceptedTraffic[gptReqPorts[queue.first].id()]++; PacketPtr pkt = queue.second.front(); DPRINTF(RouterEngine, "%s: gptRespQueues[%d] is not empty. 
" "the size is: %d.\n", __func__, @@ -538,10 +573,10 @@ RouterEngine::processNextExternalRequestEvent() __func__, pkt->getAddr(),gptReqPorts[queue.first].id()); gptReqPorts[queue.first].sendPacket(pkt); queue.second.pop(); - stats.externalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; + externalLatency[gptReqPorts[queue.first].id()] = curCycle(); } else { - stats.externalBlockedTraffic[gpnReqPorts[queue.first].id()]++; + stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; // trafficStats.m_internalTrafficCount[gpnReqPorts[queue.first].id()]->sample(queue.second.size()); } } @@ -563,9 +598,25 @@ RouterEngine::processNextExternalRequestEvent() DPRINTF(RouterEngine, "%s: The gptRespQueues is empty.\n", __func__); } + Tick next_schedule = cyclesToTicks(curCycle() + routerLatency); + for (auto itr = externalLatency.begin(); + itr != externalLatency.end(); itr++) + { + if (cyclesToTicks(itr->second + routerLatency) < next_schedule) { + if ((itr->second + routerLatency) < curCycle()) { + next_schedule = nextCycle(); + break; + } else { + next_schedule = std::min( + cyclesToTicks(itr->second + routerLatency), + next_schedule); + } + } + } + if (none_empty_queue) { if (!nextExternalRequestEvent.scheduled()) { - schedule(nextExternalRequestEvent, nextCycle()); + schedule(nextExternalRequestEvent, next_schedule); } } } diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index caf013befa..dfd27b3c40 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -36,6 +36,7 @@ #include "params/RouterEngine.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" +#include "base/types.hh" namespace gem5 { @@ -162,10 +163,13 @@ class RouterEngine : public ClockedObject std::unordered_map> gptRespQueues; std::unordered_map> gpnReqQueues; + std::unordered_map externalLatency; + std::unordered_map internalLatency; + const uint32_t gptQSize; const uint32_t gpnQSize; bool emptyQueues; - const Tick 
routerLatency; + const Cycles routerLatency; EventFunctionWrapper nextGPTGPNEvent; void processNextGPTGPNEvent(); From 1ca3f184435463623a646708cb4a630387b5a62a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 6 Feb 2023 16:26:07 -0800 Subject: [PATCH 256/287] accl: Improving the router latency Adjust the router latency based on each links last service --- configs/accl/async-pr.py | 9 +++++- configs/accl/bfs.py | 9 +++++- configs/accl/sega_simple.py | 3 +- configs/accl/sega_simple_pt2pt.py | 9 +++--- src/accl/graph/sega/router_engine.cc | 48 ++-------------------------- src/accl/graph/sega/router_engine.hh | 22 ------------- 6 files changed, 25 insertions(+), 75 deletions(-) diff --git a/configs/accl/async-pr.py b/configs/accl/async-pr.py index 600a04c665..5bd4f76209 100644 --- a/configs/accl/async-pr.py +++ b/configs/accl/async-pr.py @@ -36,6 +36,8 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("r_queue_size", type=int) + argparser.add_argument("r_latency", type=str) argparser.add_argument("graph", type=str) argparser.add_argument("alpha", type=float) argparser.add_argument("threshold", type=float) @@ -70,6 +72,8 @@ def get_inputs(): args.num_gpts, args.num_registers, args.cache_size, + args.r_queue_size, + args.r_latency, args.graph, args.alpha, args.threshold, @@ -84,6 +88,8 @@ def get_inputs(): num_gpts, num_registers, cache_size, + r_queue_size, + r_latency, graph, alpha, threshold, @@ -96,7 +102,8 @@ def get_inputs(): from sega_simple_pt2pt import SEGA else: from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 0b63088bff..6c3384a2d7 100644 --- a/configs/accl/bfs.py +++ 
b/configs/accl/bfs.py @@ -36,6 +36,8 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("r_queue_size", type=int) + argparser.add_argument("r_latency", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) @@ -86,6 +88,8 @@ def get_inputs(): args.num_gpts, args.num_registers, args.cache_size, + args.r_queue_size, + args.r_latency, args.graph, args.init_addr, args.init_value, @@ -102,6 +106,8 @@ def get_inputs(): num_gpts, num_registers, cache_size, + r_queue_size, + r_latency, graph, init_addr, init_value, @@ -119,7 +125,8 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index ce7ad982e6..f521d4090c 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -127,7 +127,8 @@ def setPort(self, port): class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, + r_queue_size, r_latency, graph_path): super(SEGA, self).__init__() # num_gpts should be an even power of 2 assert num_gpts != 0 diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index a236e5c45c..2d20903387 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ b/configs/accl/sega_simple_pt2pt.py @@ -127,7 +127,8 @@ def setPort(self, port): class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, + r_queue_size, r_latency, graph_path): super(SEGA, 
self).__init__() # num_gpts should be an even power of 2 assert num_gpts != 0 @@ -169,9 +170,9 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): for i in range(int(num_gpts/GPTPerGPN)): routers.append( RouterEngine( - gpn_queue_size = 16, - gpt_queue_size = 16, - router_latency = "1ns")) + gpn_queue_size = r_queue_size, + gpt_queue_size = r_queue_size, + router_latency = r_latency)) self.routers = routers # for gpt_0 in gpts: # for gpt_1 in gpts: diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 9a144afa04..df0fedd69e 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -79,11 +79,6 @@ RouterEngine::RouterEngine(const Params ¶ms): for (int i = 0; i < params.port_gpn_req_side_connection_count; ++i) { internalLatency[i] = curCycle(); } - // for(int i = 0; i < gpnReqPorts.size(); i++){ - // stats.internalTrafficCount.push_back(new statistics::Histogram()); - // stats.internalTrafficCount[i]->init(10); - // } - // statistics::registerDumpCallback([this]() { collateStats(); }); } void @@ -402,7 +397,6 @@ RouterEngine::processNextInternalRequestEvent() } else { stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; - // trafficStats.m_internalTrafficCount[gpnReqPorts[queue.first].id()]->sample(queue.second.size()); } } } @@ -577,7 +571,6 @@ RouterEngine::processNextExternalRequestEvent() } else { stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; - // trafficStats.m_internalTrafficCount[gpnReqPorts[queue.first].id()]->sample(queue.second.size()); } } } @@ -662,43 +655,6 @@ RouterEngine::checkGPNRetryReq() } } -// RouterEngine:: -// RouterEngineStats::RouterEngineStats(RouterEngine &_router) -// : statistics::Group(&_router), -// router(_router) -// { - -// for (int i = 0; i < router.gpnReqPorts.size(); i++) { -// m_internalTrafficCount.push_back(new statistics::Histogram(this)); -// m_internalTrafficCount[i] -// ->init(0) -// 
.flags(statistics::nozero); -// } -// } - -// void RouterEngine::resetStats() -// { -// for (int i = 0; i < gpnReqPorts.size(); i++) { -// trafficStats.m_internalTrafficCount[i]->reset(); -// } -// } - -// void -// RouterEngine::regStats() -// { -// ClockedObject::regStats(); -// } - -// void -// RouterEngine::collateStats() -// { -// for (uint32_t j = 0; j < gpnReqPorts.size(); ++j) { -// trafficStats -// .m_internalTrafficCount[j]; -// // ->add(getInternalTrafficCount(j)); -// } -// } - RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) : statistics::Group(&_router), router(_router), @@ -710,8 +666,7 @@ RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) "Number of packet passed between routers."), ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), "Number of external packets passed.") -{ -} +{} void RouterEngine::RouterEngineStat::regStats() @@ -722,6 +677,7 @@ RouterEngine::RouterEngineStat::regStats() externalBlockedTraffic.init(router.gptReqPorts.size()); internalAcceptedTraffic.init(router.gpnReqPorts.size()); externalAcceptedTraffic.init(router.gptReqPorts.size()); + } }// namespace gem5 diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index dfd27b3c40..0395648cf3 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -133,13 +133,7 @@ class RouterEngine : public ClockedObject virtual void recvFunctional(PacketPtr pkt); virtual void recvRespRetry(); }; -// struct RouterEngineStats : public statistics::Group -// { -// RouterEngineStats(RouterEngine &router); -// RouterEngine &router; -// std::vector m_internalTrafficCount; -// }; System* system; CenteralController* centeralController; bool handleRequest(PortID portId, PacketPtr pkt); @@ -196,18 +190,10 @@ class RouterEngine : public ClockedObject statistics::Vector internalAcceptedTraffic; statistics::Vector externalAcceptedTraffic; }; - RouterEngineStat stats; - public: 
PARAMS(RouterEngine); RouterEngine(const Params ¶ms); - // RouterEngineStats trafficStats; - // struct TrafficStats : public statistics::Group - // { - // TrafficStats(RouterEngine &router); - // std::vector internalTrafficCount; - // } stats; void registerCenteralController(CenteralController* centeral_controller); virtual void init() override; @@ -222,14 +208,6 @@ class RouterEngine : public ClockedObject void checkGPTRetryReq(); void checkGPNRetryReq(); bool done(); - // virtual void collateStats(); - // virtual void resetStats(); - // virtual void regStats(); - - // statistics::Histogram& getInternalTrafficCount(uint32_t t) - // { return *(stats.internalTrafficCount[t]); } - - }; From abc37824a501ac6603b6f39c79fd56b228cb0827 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 6 Feb 2023 16:30:57 -0800 Subject: [PATCH 257/287] Adding the vector of histogram (not working) --- src/accl/graph/sega/router_engine.cc | 15 +++++++++++++-- src/accl/graph/sega/router_engine.hh | 8 ++++++-- 2 files changed, 19 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index df0fedd69e..09575365fe 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -397,6 +397,8 @@ RouterEngine::processNextInternalRequestEvent() } else { stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; + stats.internalTrafficHist[gpnReqPorts[queue.first].id()]-> + sample(queue.second.size()); } } } @@ -666,7 +668,13 @@ RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) "Number of packet passed between routers."), ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), "Number of external packets passed.") -{} +{ + for (int i = 0; i < router.gpnReqPorts.size(); i++) { + internalTrafficHist.push_back(new statistics::Histogram( + this, "internalTrafficHist", + statistics::units::Count::get())); + } +} void RouterEngine::RouterEngineStat::regStats() @@ 
-678,6 +686,9 @@ RouterEngine::RouterEngineStat::regStats() internalAcceptedTraffic.init(router.gpnReqPorts.size()); externalAcceptedTraffic.init(router.gptReqPorts.size()); + for (int i = 0; i < router.gpnReqPorts.size(); i++) { + internalTrafficHist[i]->init(10); + } } -}// namespace gem5 +}// namespace gem5 \ No newline at end of file diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index 0395648cf3..661ace4a06 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -189,8 +189,10 @@ class RouterEngine : public ClockedObject statistics::Vector externalBlockedTraffic; statistics::Vector internalAcceptedTraffic; statistics::Vector externalAcceptedTraffic; + // std::vector internalTrafficHist; + std::vector internalTrafficHist; }; - RouterEngineStat stats; + public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); @@ -208,9 +210,11 @@ class RouterEngine : public ClockedObject void checkGPTRetryReq(); void checkGPNRetryReq(); bool done(); + void collateStats(); + RouterEngineStat stats; }; } -#endif // __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ +#endif // __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ \ No newline at end of file From 993c37d552115bbe57c9f39284b4db18437207fc Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 28 Mar 2023 19:28:44 -0700 Subject: [PATCH 258/287] Randomizing retry sending order. 
--- src/accl/graph/sega/wl_engine.cc | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index d563450179..442d051e43 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -28,6 +28,10 @@ #include "accl/graph/sega/wl_engine.hh" +#include +#include +#include + #include "accl/graph/sega/mpu.hh" #include "debug/SEGAStructureSize.hh" #include "debug/WLEngine.hh" @@ -135,8 +139,16 @@ WLEngine::RespPort::recvRespRetry() void WLEngine::checkRetryReq() { + std::vector random_shuffle; + for (int i = 0; i < inPorts.size(); i++) { + random_shuffle.push_back(i); + } + std::random_device rd; + std::mt19937 gen(rd()); + std::shuffle(random_shuffle.begin(), random_shuffle.end(), gen); + for (int i = 0; i < inPorts.size(); i++) { - inPorts[i].checkRetryReq(); + inPorts[random_shuffle[i]].checkRetryReq(); } } From 181a6fa9acac4debd7c77844e11d2bdfb22891ff Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 28 Mar 2023 19:17:46 -0700 Subject: [PATCH 259/287] Improving wlengine model. 
--- configs/accl/sega.py | 18 ++- configs/accl/sega_simple.py | 6 +- src/accl/graph/sega/WLEngine.py | 4 + src/accl/graph/sega/enums.cc | 6 + src/accl/graph/sega/enums.hh | 9 ++ src/accl/graph/sega/wl_engine.cc | 267 ++++++++++++++++++++----------- src/accl/graph/sega/wl_engine.hh | 14 +- 7 files changed, 228 insertions(+), 96 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 32d0dd26ab..dc7dbabb70 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -50,7 +50,11 @@ class GPT(SubSystem): def __init__(self, register_file_size: int, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size + update_queue_size=64, + register_file_size=register_file_size, + rd_per_cycle=2, + reduce_per_cycle=32, + wr_per_cycle=2, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -109,6 +113,7 @@ def set_vertex_range(self, vertex_ranges): def set_vertex_pch_bit(self, pch_bit): self.vertex_mem_ctrl.pch_bit = pch_bit + class EdgeMemory(SubSystem): def __init__(self, size: str): super(EdgeMemory, self).__init__() @@ -133,6 +138,7 @@ def getPort(self): def setPort(self, port): self.xbar.cpu_side_ports = port + class SEGA(System): def __init__(self, num_gpts, num_registers, cache_size, graph_path): super(SEGA, self).__init__() @@ -148,10 +154,12 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.mem_mode = "timing" # Building the CenteralController - self.ctrl = CenteralController(vertex_image_file=f"{graph_path}/vertices") + self.ctrl = CenteralController( + vertex_image_file=f"{graph_path}/vertices" + ) # Building the EdgeMemories edge_mem = [] - for i in range(int(num_gpts/2)): + for i in range(int(num_gpts / 2)): mem = EdgeMemory("4GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) @@ -167,7 +175,9 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): [vertex_ranges[i], vertex_ranges[i + num_gpts]] 
) gpt.set_vertex_pch_bit(pch_bit) - gpt.setEdgeMemPort(self.edge_mem[i % (int(num_gpts/2))].getPort()) + gpt.setEdgeMemPort( + self.edge_mem[i % (int(num_gpts / 2))].getPort() + ) gpts.append(gpt) # Creating the interconnect among mpus for gpt_0 in gpts: diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index f521d4090c..b389d7e3e7 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -50,7 +50,11 @@ class GPT(SubSystem): def __init__(self, register_file_size: int, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size + update_queue_size=64, + register_file_size=register_file_size, + rd_per_cycle=2, + reduce_per_cycle=32, + wr_per_cycle=2, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 5a8ed9c9fd..0940e6b718 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -44,3 +44,7 @@ class WLEngine(BaseReduceEngine): "WLEngine has. 
It can service as " "many updates as this queueu has " "entries at the same time.") + + rd_per_cycle = Param.Int("Maximum number of reads per cycle.") + reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.") + wr_per_cycle = Param.Int("Maximum number of writes per cycle.") diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index f7ef96197f..2f1bc983eb 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -31,6 +31,12 @@ namespace gem5 { +const char* registerStateStrings[NUM_REGISTER_STATE] = { + "PENDING_READ", + "PENDING_REDUCE", + "PENDING_WRITE" +}; + const char* cacheStateStrings[NUM_CACHE_STATE] = { "INVALID", "PENDING_DATA", diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index f97c33a0e0..4e7d64235e 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -32,6 +32,15 @@ namespace gem5 { +enum RegisterState +{ + PENDING_READ, + PENDING_REDUCE, + PENDING_WRITE, + NUM_REGISTER_STATE +}; +extern const char* registerStateStrings[NUM_REGISTER_STATE]; + enum CacheState { INVALID, diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 442d051e43..cf9599aeef 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -44,9 +44,13 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), + maxReadsPerCycle(params.rd_per_cycle), + maxReducesPerCycle(params.reduce_per_cycle), + maxWritesPerCycle(params.wr_per_cycle), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), + nextWriteEvent([this] { processNextWriteEvent(); }, name()), nextDoneSignalEvent([this] { processNextDoneSignalEvent(); }, name()), stats(*this) { @@ -190,89 +194,112 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) void 
WLEngine::processNextReadEvent() { - Addr update_addr; - uint32_t update_value; - Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); - - DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " + int num_reads = 0; + while (true) { + Addr update_addr; + uint32_t update_value; + Tick enter_tick; + std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); + + DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((registerFile.find(update_addr) == registerFile.end())) { - DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " - "in registerFile.\n", __func__, update_addr); - if (registerFile.size() < registerFileSize) { - DPRINTF(WLEngine, "%s: There are free registers available in the " - "registerFile.\n", __func__); - ReadReturnStatus read_status = owner->recvWLRead(update_addr); - if (read_status == ReadReturnStatus::ACCEPT) { - DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " - "request to addr: %lu.\n", __func__, update_addr); - registerFile[update_addr] = update_value; - DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " - "to registerFile. registerFile.size = %d, " - "registerFileSize = %d.\n", __func__, update_addr, - update_value, registerFile.size(), registerFileSize); - DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " - "to registerFile. 
registerFile.size = %d, " - "registerFileSize = %d.\n", __func__, update_addr, - update_value, registerFile.size(), registerFileSize); + if ((registerFile.find(update_addr) == registerFile.end())) { + DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " + "in registerFile.\n", __func__, update_addr); + if (registerFile.size() < registerFileSize) { + DPRINTF(WLEngine, "%s: There are free registers available in the " + "registerFile.\n", __func__); + ReadReturnStatus read_status = owner->recvWLRead(update_addr); + if (read_status == ReadReturnStatus::ACCEPT) { + DPRINTF(WLEngine, "%s: CoalesceEngine returned true for read " + "request to addr: %lu.\n", __func__, update_addr); + registerFile[update_addr] = std::make_tuple(RegisterState::PENDING_READ, update_value); + DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + DPRINTF(WLEngine, "%s: Added (addr: %lu, value: %u) " + "to registerFile. registerFile.size = %d, " + "registerFileSize = %d.\n", __func__, update_addr, + update_value, registerFile.size(), registerFileSize); + updateQueue.pop_front(); + stats.updateQueueLatency.sample( + (curTick() - enter_tick) * 1e9 / getClockFrequency()); + DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " + "from updateQueue. updateQueue.size = %d. 
" + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); + vertexReadTime[update_addr] = curTick(); + checkRetryReq(); + } else { + if (read_status == ReadReturnStatus::REJECT_ROLL) { + updateQueue.pop_front(); + updateQueue.emplace_back( + update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Rolling the update.\n", __func__); + stats.numUpdateRolls++; + } else { + DPRINTF(WLEngine, "%s: Received a reject from cache. " + "Not rolling the update.\n", __func__); + } + } + } else { + DPRINTF(WLEngine, "%s: There are no free registers " + "available in the registerFile.\n", __func__); + stats.registerShortage++; + } + } else { + RegisterState state = std::get<0>(registerFile[update_addr]); + if (state == RegisterState::PENDING_WRITE) { + // NOTE: If it's pending write, let it be written. + updateQueue.pop_front(); + updateQueue.emplace_back(update_addr, update_value, enter_tick); + } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", + __func__, update_addr, update_addr, std::get<1>(registerFile[update_addr])); + uint32_t curr_value = std::get<1>(registerFile[update_addr]); + uint32_t new_value = graphWorkload->reduce(update_value, curr_value); + registerFile[update_addr] = std::make_tuple(state, new_value); + DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" + " registerFile. registerFile[%lu] = %u.\n", __func__, + update_value, update_addr, std::get<1>(registerFile[update_addr])); + stats.registerFileCoalesce++; updateQueue.pop_front(); stats.updateQueueLatency.sample( - (curTick() - enter_tick) * 1e9 / getClockFrequency()); + (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " - "from updateQueue. updateQueue.size = %d. 
" - "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); + "from updateQueue. updateQueue.size = %d. " + "updateQueueSize = %d.\n", __func__, update_addr, + update_value, updateQueue.size(), updateQueueSize); DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); checkRetryReq(); - vertexReadTime[update_addr] = curTick(); - } else { - if (read_status == ReadReturnStatus::REJECT_ROLL) { - updateQueue.pop_front(); - updateQueue.emplace_back( - update_addr, update_value, enter_tick); - DPRINTF(WLEngine, "%s: Received a reject from cache. " - "Rolling the update.\n", __func__); - stats.numUpdateRolls++; - } else { - DPRINTF(WLEngine, "%s: Received a reject from cache. " - "Not rolling the update.\n", __func__); - } } - } else { - DPRINTF(WLEngine, "%s: There are no free registers " - "available in the registerFile.\n", __func__); - stats.registerShortage++; } - } else { - DPRINTF(WLEngine, "%s: A register has already been allocated for " - "addr: %lu in registerFile. registerFile[%lu] = %u.\n", - __func__, update_addr, update_addr, registerFile[update_addr]); - registerFile[update_addr] = - graphWorkload->reduce(update_value, registerFile[update_addr]); - DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" - " registerFile. registerFile[%lu] = %u.\n", __func__, - update_value, update_addr, registerFile[update_addr]); - stats.registerFileCoalesce++; - updateQueue.pop_front(); - stats.updateQueueLatency.sample( - (curTick() - enter_tick) * 1e9 / getClockFrequency()); - DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " - "from updateQueue. updateQueue.size = %d. " - "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " - "from updateQueue. 
updateQueue.size = %d. " - "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); - checkRetryReq(); + + num_reads++; + if (num_reads >= maxReadsPerCycle) { + // NOTE: Add stat here to count read port shortage. + break; + } + if (updateQueue.empty()) { + break; + } } - if (!updateQueue.empty() && (!nextReadEvent.scheduled())) { + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } + if (!updateQueue.empty() && !nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); } } @@ -281,6 +308,7 @@ void WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) { assert(workListFile.size() <= registerFileSize); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_READ); workListFile[addr] = wl; DPRINTF(SEGAStructureSize, "%s: Added (addr: %lu, wl: %s) to " @@ -290,11 +318,14 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) "workListFile. workListFile.size = %d.\n", __func__, addr, graphWorkload->printWorkListItem(wl), workListFile.size()); + uint32_t value = std::get<0>(registerFile[addr]); + registerFile[addr] = std::make_tuple(RegisterState::PENDING_REDUCE, value); + toReduce.push_back(addr); + stats.vertexReadLatency.sample( ((curTick() - vertexReadTime[addr]) * 1e9) / getClockFrequency()); vertexReadTime.erase(addr); - assert(!workListFile.empty()); if (!nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -303,35 +334,93 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - for (auto &it : workListFile) { - Addr addr = it.first; - assert(registerFile.find(addr) != registerFile.end()); - uint32_t update_value = registerFile[addr]; - DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" - ". 
registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - __func__, addr, registerFile[addr], addr, - graphWorkload->printWorkListItem(workListFile[addr])); - // TODO: Generalize this to reduce function rather than just min + + // for (auto &it : workListFile) { + // Addr addr = it.first; + // assert(registerFile.find(addr) != registerFile.end()); + // uint32_t update_value = registerFile[addr]; + // DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" + // ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", + // __func__, addr, registerFile[addr], addr, + // graphWorkload->printWorkListItem(workListFile[addr])); + // // TODO: Generalize this to reduce function rather than just min + // workListFile[addr].tempProp = + // graphWorkload->reduce(update_value, workListFile[addr].tempProp); + // DPRINTF(WLEngine, "%s: Reduction done. workListFile[%lu] = %s.\n", + // __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); + // stats.numReduce++; + + // owner->recvWLWrite(addr, workListFile[addr]); + // registerFile.erase(addr); + // DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " + // "registerFile.size = %d, registerFileSize = %d\n", + // __func__, addr, registerFile.size(), registerFileSize); + // DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " + // "registerFile.size = %d, registerFileSize = %d\n", + // __func__, addr, registerFile.size(), registerFileSize); + // } + // workListFile.clear(); + + int num_reduces = 0; + while (true) { + Addr addr = toReduce.front(); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_REDUCE); + uint32_t update_value = std::get<1>(registerFile[addr]); workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); - DPRINTF(WLEngine, "%s: Reduction done. 
workListFile[%lu] = %s.\n", - __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); - stats.numReduce++; + registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); + num_reduces++; + toReduce.pop_front(); + toWrite.push_back(addr); + + if (num_reduces >= maxReducesPerCycle) { + // TODO: Add stat to count reducer shortage; + break; + } + if (toReduce.empty()) { + break; + } + } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { + schedule(nextReduceEvent, nextCycle()); + } + // if (done() && !nextDoneSignalEvent.scheduled()) { + // schedule(nextDoneSignalEvent, nextCycle()); + // } +} + +void +WLEngine::processNextWriteEvent() +{ + int num_writes = 0; + while (true) { + Addr addr = toWrite.front(); + assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_WRITE); owner->recvWLWrite(addr, workListFile[addr]); registerFile.erase(addr); - DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " - "registerFile.size = %d, registerFileSize = %d\n", - __func__, addr, registerFile.size(), registerFileSize); - DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. 
" - "registerFile.size = %d, registerFileSize = %d\n", - __func__, addr, registerFile.size(), registerFileSize); + workListFile.erase(addr); + toWrite.pop_front(); + num_writes++; + if (num_writes >= maxWritesPerCycle) { + break; + } + if (toWrite.empty()) { + break; + } } - workListFile.clear(); if (done() && !nextDoneSignalEvent.scheduled()) { schedule(nextDoneSignalEvent, nextCycle()); } + + if (!toWrite.empty() && !nextWriteEvent.scheduled()) { + schedule(nextWriteEvent, nextCycle()); + } } void diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index fb147e692a..bd32b16d9d 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -79,10 +79,17 @@ class WLEngine : public BaseReduceEngine int updateQueueSize; std::deque> updateQueue; + int maxReadsPerCycle; + int maxReducesPerCycle; + int maxWritesPerCycle; + int registerFileSize; - std::unordered_map registerFile; - std::unordered_map vertexReadTime; + std::unordered_map> registerFile; std::unordered_map workListFile; + std::deque toReduce; + std::deque toWrite; + + std::unordered_map vertexReadTime; EventFunctionWrapper nextReadEvent; void processNextReadEvent(); @@ -90,6 +97,9 @@ class WLEngine : public BaseReduceEngine EventFunctionWrapper nextReduceEvent; void processNextReduceEvent(); + EventFunctionWrapper nextWriteEvent; + void processNextWriteEvent(); + EventFunctionWrapper nextDoneSignalEvent; void processNextDoneSignalEvent(); From e6d68749e102d424b7a07db73e939e6a8c97b8bf Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 1 Apr 2023 14:35:52 -0700 Subject: [PATCH 260/287] Improving vertex access time by improving updateQeueu reads + more stats --- configs/accl/sega.py | 13 +++----- src/accl/graph/sega/WLEngine.py | 2 ++ src/accl/graph/sega/wl_engine.cc | 53 ++++++++++++++++++++++++++------ src/accl/graph/sega/wl_engine.hh | 5 +++ 4 files changed, 55 insertions(+), 18 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py 
index dc7dbabb70..58a8caddde 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -52,9 +52,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.wl_engine = WLEngine( update_queue_size=64, register_file_size=register_file_size, - rd_per_cycle=2, + rd_per_cycle=4, reduce_per_cycle=32, - wr_per_cycle=2, + wr_per_cycle=4, + num_updates_processed=8, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -73,12 +74,8 @@ def __init__(self, register_file_size: int, cache_size: str): ) self.vertex_mem_ctrl = HBMCtrl( - dram=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), - dram_2=HBM_2000_4H_1x64( - page_policy="close", read_buffer_size=96, write_buffer_size=96 - ), + dram=HBM_2000_4H_1x64(), + dram_2=HBM_2000_4H_1x64(), ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index 0940e6b718..cfec70081d 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -48,3 +48,5 @@ class WLEngine(BaseReduceEngine): rd_per_cycle = Param.Int("Maximum number of reads per cycle.") reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.") wr_per_cycle = Param.Int("Maximum number of writes per cycle.") + + num_updates_processed = Param.Int("Maximum number of updates processed") diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index cf9599aeef..276fcd1281 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -47,6 +47,7 @@ WLEngine::WLEngine(const WLEngineParams& params): maxReadsPerCycle(params.rd_per_cycle), maxReducesPerCycle(params.reduce_per_cycle), maxWritesPerCycle(params.wr_per_cycle), + maxUpdatesProcessed(params.num_updates_processed), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), @@ 
-171,6 +172,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) } updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); + stats.numberIncomingUpdaes++; DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", __func__, pkt->getAddr(), pkt->getLE(), @@ -195,11 +197,23 @@ void WLEngine::processNextReadEvent() { int num_reads = 0; + int num_tries = 0; + std::deque> tempQueue; + + for (int i = 0; i < maxUpdatesProcessed; i++) { + if (updateQueue.empty()) { + break; + } + tempQueue.push_back(updateQueue.front()); + updateQueue.pop_front(); + } + while (true) { + num_tries += 1; Addr update_addr; uint32_t update_value; Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = updateQueue.front(); + std::tie(update_addr, update_value, enter_tick) = tempQueue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -223,13 +237,14 @@ WLEngine::processNextReadEvent() "to registerFile. registerFile.size = %d, " "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); - updateQueue.pop_front(); + tempQueue.pop_front(); + num_reads++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, - update_value, updateQueue.size(), updateQueueSize); + update_value, tempQueue.size(), updateQueueSize); DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, @@ -238,8 +253,8 @@ WLEngine::processNextReadEvent() checkRetryReq(); } else { if (read_status == ReadReturnStatus::REJECT_ROLL) { - updateQueue.pop_front(); - updateQueue.emplace_back( + tempQueue.pop_front(); + tempQueue.emplace_back( update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); @@ -258,8 +273,8 @@ WLEngine::processNextReadEvent() RegisterState state = std::get<0>(registerFile[update_addr]); if (state == RegisterState::PENDING_WRITE) { // NOTE: If it's pending write, let it be written. - updateQueue.pop_front(); - updateQueue.emplace_back(update_addr, update_value, enter_tick); + tempQueue.pop_front(); + tempQueue.emplace_back(update_addr, update_value, enter_tick); } else { DPRINTF(WLEngine, "%s: A register has already been allocated for " "addr: %lu in registerFile. registerFile[%lu] = %u.\n", @@ -271,7 +286,7 @@ WLEngine::processNextReadEvent() " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, std::get<1>(registerFile[update_addr])); stats.registerFileCoalesce++; - updateQueue.pop_front(); + tempQueue.pop_front(); stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " @@ -286,16 +301,26 @@ WLEngine::processNextReadEvent() } } - num_reads++; + // num_reads++; if (num_reads >= maxReadsPerCycle) { // NOTE: Add stat here to count read port shortage. 
+ stats.numReadPortShortage++; break; } - if (updateQueue.empty()) { + if (num_tries > maxUpdatesProcessed) { + break; + } + + if (tempQueue.empty()) { break; } } + for (int i = 0; i < tempQueue.size(); i++){ + updateQueue.push_front(tempQueue.back()); + tempQueue.pop_back(); + } + if (!toReduce.empty() && !nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } @@ -407,6 +432,7 @@ WLEngine::processNextWriteEvent() toWrite.pop_front(); num_writes++; if (num_writes >= maxWritesPerCycle) { + stats.numWritePortShortage++; break; } if (toWrite.empty()) { @@ -444,6 +470,12 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) ADD_STAT(numUpdateRolls, statistics::units::Count::get(), "Number of times an update has been rolled back " "to the back of the update queue due to cache reject."), + ADD_STAT(numReadPortShortage, statistics::units::Count::get(), + "Number of times limited by read per cycle."), + ADD_STAT(numWritePortShortage, statistics::units::Count::get(), + "Number of times limited by write per cycle."), + ADD_STAT(numberIncomingUpdaes, statistics::units::Count::get(), + "Number of inocoming updates for each GPT."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), ADD_STAT(updateQueueLatency, statistics::units::Second::get(), @@ -458,6 +490,7 @@ WLEngine::WorkListStats::regStats() vertexReadLatency.init(64); updateQueueLatency.init(64); + } } // namespace gem5 diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index bd32b16d9d..8f55ecadd4 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -83,6 +83,8 @@ class WLEngine : public BaseReduceEngine int maxReducesPerCycle; int maxWritesPerCycle; + int maxUpdatesProcessed; + int registerFileSize; std::unordered_map> registerFile; std::unordered_map workListFile; @@ -115,6 +117,9 @@ class WLEngine : public BaseReduceEngine statistics::Scalar registerFileCoalesce; 
statistics::Scalar registerShortage; statistics::Scalar numUpdateRolls; + statistics::Scalar numReadPortShortage; + statistics::Scalar numWritePortShortage; + statistics::Scalar numberIncomingUpdaes; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From 964722188b1f2553f9a1f86e8675e74ce9b3ae3f Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Sat, 1 Apr 2023 15:05:15 -0700 Subject: [PATCH 261/287] Cleaning up wl_engine.cc --- src/accl/graph/sega/wl_engine.cc | 34 -------------------------------- 1 file changed, 34 deletions(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 276fcd1281..69e874c0d6 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -191,8 +191,6 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) return true; } -// TODO: Parameterize the number of pops WLEngine can do at a time. -// TODO: Add a histogram stats of the size of the updateQueue. Sample here. void WLEngine::processNextReadEvent() { @@ -301,9 +299,7 @@ WLEngine::processNextReadEvent() } } - // num_reads++; if (num_reads >= maxReadsPerCycle) { - // NOTE: Add stat here to count read port shortage. stats.numReadPortShortage++; break; } @@ -359,33 +355,6 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) void WLEngine::processNextReduceEvent() { - - // for (auto &it : workListFile) { - // Addr addr = it.first; - // assert(registerFile.find(addr) != registerFile.end()); - // uint32_t update_value = registerFile[addr]; - // DPRINTF(WLEngine, "%s: Reducing between registerFile and workListFile" - // ". registerFile[%lu] = %u, workListFile[%lu] = %s.\n", - // __func__, addr, registerFile[addr], addr, - // graphWorkload->printWorkListItem(workListFile[addr])); - // // TODO: Generalize this to reduce function rather than just min - // workListFile[addr].tempProp = - // graphWorkload->reduce(update_value, workListFile[addr].tempProp); - // DPRINTF(WLEngine, "%s: Reduction done. 
workListFile[%lu] = %s.\n", - // __func__, addr, graphWorkload->printWorkListItem(workListFile[addr])); - // stats.numReduce++; - - // owner->recvWLWrite(addr, workListFile[addr]); - // registerFile.erase(addr); - // DPRINTF(SEGAStructureSize, "%s: Removed addr: %lu from registerFile. " - // "registerFile.size = %d, registerFileSize = %d\n", - // __func__, addr, registerFile.size(), registerFileSize); - // DPRINTF(WLEngine, "%s: Removed addr: %lu from registerFile. " - // "registerFile.size = %d, registerFileSize = %d\n", - // __func__, addr, registerFile.size(), registerFileSize); - // } - // workListFile.clear(); - int num_reduces = 0; while (true) { Addr addr = toReduce.front(); @@ -414,9 +383,6 @@ WLEngine::processNextReduceEvent() if (!toReduce.empty() && !nextReduceEvent.scheduled()) { schedule(nextReduceEvent, nextCycle()); } - // if (done() && !nextDoneSignalEvent.scheduled()) { - // schedule(nextDoneSignalEvent, nextCycle()); - // } } void From 1acd6d18ba26e464b03676c65f23be631549b271 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Oct 2022 14:10:57 -0700 Subject: [PATCH 262/287] Fixing done, code style and conifg. Adding a stat. --- src/accl/graph/sega/push_engine.cc | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 4703e27d16..d3dc381625 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -53,11 +53,16 @@ PushEngine::PushEngine(const Params& params): { destinationQueues.clear(); for (int i = 0; i < params.port_out_ports_connection_count; ++i) { +<<<<<<< HEAD outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); destinationQueues.emplace_back(); destinationQueues[i].clear(); sourceAndValueMaps.emplace_back(); sourceAndValueMaps[i].clear(); +======= + outPorts.emplace_back( + name() + ".out_ports" + std::to_string(i), this, i); +>>>>>>> Fixing done, code style and conifg. 
Adding a stat. } } @@ -154,7 +159,26 @@ PushEngine::done() { bool empty_update_queues = true; for (int i = 0; i < outPorts.size(); i++) { +<<<<<<< HEAD empty_update_queues &= destinationQueues[i].empty(); +======= + empty_update_queues &= updateQueues[outPorts[i].id()].empty(); + } + return empty_update_queues && edgeQueue.empty() && + (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); +} + + +uint32_t +PushEngine::propagate(uint32_t value, uint32_t weight) +{ + uint32_t update; + if (workload == "BFS") { + update = value + 1; + } + else{ + panic("The workload %s is not supported", workload); +>>>>>>> Fixing done, code style and conifg. Adding a stat. } return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); From 8620ccd6154f2cabda3564a57d54f8d8ec993eb1 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sat, 1 Apr 2023 22:21:29 -0700 Subject: [PATCH 263/287] Adding the router latency as entry, fixing some siddues with scheduling, todo: add stats --- configs/accl/pr.py | 9 ++- configs/accl/sega_simple_pt2pt.py | 7 ++- src/accl/graph/sega/push_engine.cc | 30 +--------- src/accl/graph/sega/router_engine.cc | 87 +++++++++++++--------------- src/accl/graph/sega/router_engine.hh | 27 ++++----- 5 files changed, 68 insertions(+), 92 deletions(-) diff --git a/configs/accl/pr.py b/configs/accl/pr.py index af8669775b..a7f30f02e9 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -36,6 +36,8 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("r_queue_size", type=int) + argparser.add_argument("r_latency", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("iterations", type=int) argparser.add_argument("alpha", type=float) @@ -72,6 +74,8 @@ def get_inputs(): args.num_gpts, args.num_registers, args.cache_size, + args.r_queue_size, + args.r_latency, args.graph, 
args.iterations, args.alpha, @@ -88,6 +92,8 @@ def get_inputs(): num_gpts, num_registers, cache_size, + r_queue_size, + r_latency, graph, iterations, alpha, @@ -104,7 +110,8 @@ def get_inputs(): from sega_simple_pt2pt import SEGA else: from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index 2d20903387..9c2dd17481 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ b/configs/accl/sega_simple_pt2pt.py @@ -50,7 +50,12 @@ class GPT(SubSystem): def __init__(self, register_file_size: int, cache_size: str): super().__init__() self.wl_engine = WLEngine( - update_queue_size=64, register_file_size=register_file_size + update_queue_size=64, + register_file_size=register_file_size, + rd_per_cycle=4, + reduce_per_cycle=32, + wr_per_cycle=4, + num_updates_processed=8, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index d3dc381625..de3764a605 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -53,16 +53,11 @@ PushEngine::PushEngine(const Params& params): { destinationQueues.clear(); for (int i = 0; i < params.port_out_ports_connection_count; ++i) { -<<<<<<< HEAD outPorts.emplace_back(name() + ".out_ports" + std::to_string(i), this, i); destinationQueues.emplace_back(); destinationQueues[i].clear(); sourceAndValueMaps.emplace_back(); sourceAndValueMaps[i].clear(); -======= - outPorts.emplace_back( - name() + ".out_ports" + std::to_string(i), this, i); ->>>>>>> Fixing done, code style and conifg. Adding a stat. 
} } @@ -84,9 +79,9 @@ PushEngine::init() localAddrRange = owner->getAddrRanges(); for (int i = 0; i < outPorts.size(); i++){ AddrRangeList range_list = outPorts[i].getAddrRanges(); - assert(range_list.size() == 1); - AddrRange range = outPorts[i].getAddrRanges().front(); - portAddrMap.insert(range, i); + for (auto range: range_list) { + portAddrMap.insert(range, i); + } } } @@ -159,26 +154,7 @@ PushEngine::done() { bool empty_update_queues = true; for (int i = 0; i < outPorts.size(); i++) { -<<<<<<< HEAD empty_update_queues &= destinationQueues[i].empty(); -======= - empty_update_queues &= updateQueues[outPorts[i].id()].empty(); - } - return empty_update_queues && edgeQueue.empty() && - (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); -} - - -uint32_t -PushEngine::propagate(uint32_t value, uint32_t weight) -{ - uint32_t update; - if (workload == "BFS") { - update = value + 1; - } - else{ - panic("The workload %s is not supported", workload); ->>>>>>> Fixing done, code style and conifg. Adding a stat. 
} return empty_update_queues && metaEdgeQueue.empty() && (onTheFlyMemReqs == 0) && edgePointerQueue.empty(); diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 09575365fe..4c81837856 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -47,9 +47,8 @@ RouterEngine::RouterEngine(const Params ¶ms): [this] { processNextInternalRequestEvent(); }, name()), nextGPNGPTEvent([this] { processNextGPNGPTEvent(); }, name()), nextExternalRequestEvent( - [this] { processNextExternalRequestEvent(); }, name()), -// trafficStats(*this) - stats(*this) + [this] { processNextExternalRequestEvent(); }, name()) +// stats(*this) { for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) { @@ -133,6 +132,7 @@ RouterEngine::init() for (int i = 0; i < gptReqPorts.size(); i++) { gptAddrMap[gptReqPorts[i].id()] = gptReqPorts[i].getAddrRanges(); } + std::cout<<"gptReqPorts: "<getAddr(), gpnReqPorts[queue.first].id()); @@ -396,12 +397,13 @@ RouterEngine::processNextInternalRequestEvent() internalLatency[gpnReqPorts[queue.first].id()] = curCycle(); } else { - stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; - stats.internalTrafficHist[gpnReqPorts[queue.first].id()]-> - sample(queue.second.size()); + DPRINTF(RouterEngine, "%s: port id %d is blocked.\n", + __func__, gpnReqPorts[queue.first].id()); + // stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; } } } + for (auto &queue: gpnRespQueues) { if (!queue.second.empty()) { none_empty_queue = true; @@ -422,7 +424,7 @@ RouterEngine::processNextInternalRequestEvent() itr++) { if (cyclesToTicks(itr->second + routerLatency) < next_schedule) { - if ((itr->second + routerLatency) < curCycle()) { + if ((itr->second + routerLatency) <= curCycle()) { next_schedule = nextCycle(); break; } else { @@ -433,6 +435,7 @@ RouterEngine::processNextInternalRequestEvent() } } + if (none_empty_queue && (!nextInternalRequestEvent.scheduled())) { 
schedule(nextInternalRequestEvent, next_schedule); } @@ -560,7 +563,7 @@ RouterEngine::processNextExternalRequestEvent() < routerLatency) { continue; } - stats.externalAcceptedTraffic[gptReqPorts[queue.first].id()]++; + // stats.externalAcceptedTraffic[gptReqPorts[queue.first].id()]++; PacketPtr pkt = queue.second.front(); DPRINTF(RouterEngine, "%s: gptRespQueues[%d] is not empty. " "the size is: %d.\n", __func__, @@ -571,9 +574,9 @@ RouterEngine::processNextExternalRequestEvent() queue.second.pop(); externalLatency[gptReqPorts[queue.first].id()] = curCycle(); } - else { - stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; - } + // else { + // stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; + // } } } @@ -598,7 +601,7 @@ RouterEngine::processNextExternalRequestEvent() itr != externalLatency.end(); itr++) { if (cyclesToTicks(itr->second + routerLatency) < next_schedule) { - if ((itr->second + routerLatency) < curCycle()) { + if ((itr->second + routerLatency) <= curCycle()) { next_schedule = nextCycle(); break; } else { @@ -657,38 +660,28 @@ RouterEngine::checkGPNRetryReq() } } -RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) - : statistics::Group(&_router), - router(_router), - ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(), - "Number of packets blocked between routers."), - ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(), - "Number of external packets blocked."), - ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), - "Number of packet passed between routers."), - ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), - "Number of external packets passed.") -{ - for (int i = 0; i < router.gpnReqPorts.size(); i++) { - internalTrafficHist.push_back(new statistics::Histogram( - this, "internalTrafficHist", - statistics::units::Count::get())); - } -} - -void -RouterEngine::RouterEngineStat::regStats() -{ - using namespace statistics; - - 
internalBlockedTraffic.init(router.gpnReqPorts.size()); - externalBlockedTraffic.init(router.gptReqPorts.size()); - internalAcceptedTraffic.init(router.gpnReqPorts.size()); - externalAcceptedTraffic.init(router.gptReqPorts.size()); - - for (int i = 0; i < router.gpnReqPorts.size(); i++) { - internalTrafficHist[i]->init(10); - } -} +// RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) +// : statistics::Group(&_router), +// router(_router), +// ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(), +// "Number of packets blocked between routers."), +// ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(), +// "Number of external packets blocked."), +// ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), +// "Number of packet passed between routers."), +// ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), +// "Number of external packets passed.") +// {} + +// void +// RouterEngine::RouterEngineStat::regStats() +// { +// using namespace statistics; + +// internalBlockedTraffic.init(router.gpnReqPorts.size()); +// externalBlockedTraffic.init(router.gptReqPorts.size()); +// internalAcceptedTraffic.init(router.gpnReqPorts.size()); +// externalAcceptedTraffic.init(router.gptReqPorts.size()); +// } }// namespace gem5 \ No newline at end of file diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index 661ace4a06..1e052f08d5 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -177,22 +177,20 @@ class RouterEngine : public ClockedObject EventFunctionWrapper nextExternalRequestEvent; void processNextExternalRequestEvent(); - struct RouterEngineStat : public statistics::Group - { - RouterEngineStat(RouterEngine &push); - - void regStats() override; + // struct RouterEngineStat : public statistics::Group + // { + // RouterEngineStat(RouterEngine &push); - RouterEngine &router; + // void regStats() override; - 
statistics::Vector internalBlockedTraffic; - statistics::Vector externalBlockedTraffic; - statistics::Vector internalAcceptedTraffic; - statistics::Vector externalAcceptedTraffic; - // std::vector internalTrafficHist; - std::vector internalTrafficHist; - }; + // RouterEngine &router; + // statistics::Vector internalBlockedTraffic; + // statistics::Vector externalBlockedTraffic; + // statistics::Vector internalAcceptedTraffic; + // statistics::Vector externalAcceptedTraffic; + // }; + // RouterEngineStat stats; public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); @@ -210,9 +208,6 @@ class RouterEngine : public ClockedObject void checkGPTRetryReq(); void checkGPNRetryReq(); bool done(); - void collateStats(); - RouterEngineStat stats; - }; } From b4a2911d772a738abbc1458503d54e4a9d42967d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 19 May 2023 16:57:50 -0700 Subject: [PATCH 264/287] Adding statistics to capture the traffic pattern between GPNs --- src/accl/graph/sega/router_engine.cc | 76 +++++++++++++++++----------- src/accl/graph/sega/router_engine.hh | 24 +++++---- 2 files changed, 60 insertions(+), 40 deletions(-) diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 4c81837856..1e57cfb7de 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -47,13 +47,15 @@ RouterEngine::RouterEngine(const Params ¶ms): [this] { processNextInternalRequestEvent(); }, name()), nextGPNGPTEvent([this] { processNextGPNGPTEvent(); }, name()), nextExternalRequestEvent( - [this] { processNextExternalRequestEvent(); }, name()) -// stats(*this) + [this] { processNextExternalRequestEvent(); }, name()), + stats(*this) { for (int i = 0; i < params.port_gpt_req_side_connection_count; ++i) { gptReqPorts.emplace_back( name() + ".gpt_req_side" + std::to_string(i), this, i); + // m_newTraffic.emplace_back(new statistics::Histogram()); + // m_newTraffic[i]->init(10); } for (int i = 0; i < 
params.port_gpt_resp_side_connection_count; ++i) { @@ -135,6 +137,14 @@ RouterEngine::init() std::cout<<"gptReqPorts: "<reset(); +// } +// } + void RouterEngine::startup() { @@ -334,6 +344,7 @@ RouterEngine::processNextGPTGPNEvent() "gpnRespQueue[%d]. gpnRespQueue size is: %d\n", __func__, pkt->getAddr(), i, gpnRespQueues[gpnReqPorts[i].id()].size()); + // stats.internalTrafficHist[gpnReqPorts[i].id()]->sample(gpnRespQueues[gpnReqPorts[i].id()].size()); queue.second.pop(); DPRINTF(RouterEngine, "%s: gptReqQueue size is: %d.\n", __func__, queue.second.size()); @@ -388,7 +399,7 @@ RouterEngine::processNextInternalRequestEvent() < routerLatency) { continue; } - // stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; + stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; PacketPtr pkt = queue.second.front(); DPRINTF(RouterEngine, "%s: Sending packet %s to router: %d.\n", __func__, pkt->getAddr(), gpnReqPorts[queue.first].id()); @@ -399,7 +410,7 @@ RouterEngine::processNextInternalRequestEvent() else { DPRINTF(RouterEngine, "%s: port id %d is blocked.\n", __func__, gpnReqPorts[queue.first].id()); - // stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; + stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; } } } @@ -563,7 +574,7 @@ RouterEngine::processNextExternalRequestEvent() < routerLatency) { continue; } - // stats.externalAcceptedTraffic[gptReqPorts[queue.first].id()]++; + stats.externalAcceptedTraffic[gptReqPorts[queue.first].id()]++; PacketPtr pkt = queue.second.front(); DPRINTF(RouterEngine, "%s: gptRespQueues[%d] is not empty. 
" "the size is: %d.\n", __func__, @@ -574,9 +585,9 @@ RouterEngine::processNextExternalRequestEvent() queue.second.pop(); externalLatency[gptReqPorts[queue.first].id()] = curCycle(); } - // else { - // stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; - // } + else { + stats.externalBlockedTraffic[gptReqPorts[queue.first].id()]++; + } } } @@ -660,28 +671,35 @@ RouterEngine::checkGPNRetryReq() } } -// RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) -// : statistics::Group(&_router), -// router(_router), -// ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(), -// "Number of packets blocked between routers."), -// ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(), -// "Number of external packets blocked."), -// ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), -// "Number of packet passed between routers."), -// ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), -// "Number of external packets passed.") -// {} +RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) + : statistics::Group(&_router), + router(_router), + ADD_STAT(internalBlockedTraffic, statistics::units::Count::get(), + "Number of packets blocked between routers."), + ADD_STAT(externalBlockedTraffic, statistics::units::Count::get(), + "Number of external packets blocked."), + ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), + "Number of packet passed between routers."), + ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), + "Number of external packets passed.") +{} -// void -// RouterEngine::RouterEngineStat::regStats() -// { -// using namespace statistics; +void +RouterEngine::RouterEngineStat::regStats() +{ + using namespace statistics; -// internalBlockedTraffic.init(router.gpnReqPorts.size()); -// externalBlockedTraffic.init(router.gptReqPorts.size()); -// internalAcceptedTraffic.init(router.gpnReqPorts.size()); -// 
externalAcceptedTraffic.init(router.gptReqPorts.size()); -// } + internalBlockedTraffic.init(router.gpnReqPorts.size()); + externalBlockedTraffic.init(router.gptReqPorts.size()); + internalAcceptedTraffic.init(router.gpnReqPorts.size()); + externalAcceptedTraffic.init(router.gptReqPorts.size()); + for (uint32_t i = 0; i < router.gpnReqPorts.size(); ++i) { + internalTrafficHist.push_back(new statistics::Histogram(this)); + internalTrafficHist[i] + ->init(64) + .name(csprintf("internal_traffic_hist")) + .desc(""); + } +} }// namespace gem5 \ No newline at end of file diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index 1e052f08d5..9bd44b6147 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -177,20 +177,22 @@ class RouterEngine : public ClockedObject EventFunctionWrapper nextExternalRequestEvent; void processNextExternalRequestEvent(); - // struct RouterEngineStat : public statistics::Group - // { - // RouterEngineStat(RouterEngine &push); + struct RouterEngineStat : public statistics::Group + { + RouterEngineStat(RouterEngine &push); + + void regStats() override; - // void regStats() override; + RouterEngine &router; - // RouterEngine &router; + statistics::Vector internalBlockedTraffic; + statistics::Vector externalBlockedTraffic; + statistics::Vector internalAcceptedTraffic; + statistics::Vector externalAcceptedTraffic; + std::vector internalTrafficHist; + }; + RouterEngineStat stats; - // statistics::Vector internalBlockedTraffic; - // statistics::Vector externalBlockedTraffic; - // statistics::Vector internalAcceptedTraffic; - // statistics::Vector externalAcceptedTraffic; - // }; - // RouterEngineStat stats; public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); From b328de70ac66419b05c84f3fe2407d143bd7686d Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 19 May 2023 17:31:21 -0700 Subject: [PATCH 265/287] Adding vector of histogram statistic. 
--- src/accl/graph/sega/router_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 1e57cfb7de..bcd8479df0 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -344,7 +344,7 @@ RouterEngine::processNextGPTGPNEvent() "gpnRespQueue[%d]. gpnRespQueue size is: %d\n", __func__, pkt->getAddr(), i, gpnRespQueues[gpnReqPorts[i].id()].size()); - // stats.internalTrafficHist[gpnReqPorts[i].id()]->sample(gpnRespQueues[gpnReqPorts[i].id()].size()); + stats.internalTrafficHist[gpnReqPorts[i].id()]->sample(gpnRespQueues[gpnReqPorts[i].id()].size()); queue.second.pop(); DPRINTF(RouterEngine, "%s: gptReqQueue size is: %d.\n", __func__, queue.second.size()); From 60ce2e6c16b4de632968c348445e3ba4647ad476 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 3 Apr 2023 13:00:36 -0700 Subject: [PATCH 266/287] Fixing = operator for UniqueFIFO. --- src/accl/graph/base/data_structs.hh | 13 +++++- src/accl/graph/sega/coalesce_engine.cc | 38 +++++++++++++-- src/accl/graph/sega/coalesce_engine.hh | 5 ++ src/accl/graph/sega/wl_engine.cc | 64 ++++++++++++++------------ 4 files changed, 85 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index a391e0794d..60391b3a7c 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -34,6 +34,7 @@ #include #include +#include #include namespace gem5 @@ -166,6 +167,11 @@ class UniqueFIFO container.clear(); } + ~UniqueFIFO() { + delete [] added; + delete [] deleted; + } + void fix_front() { while(true) { T elem = container.front(); @@ -234,10 +240,13 @@ class UniqueFIFO } void operator=(const UniqueFIFO& rhs) { + cap = rhs.cap; pop = rhs.pop; container = rhs.container; - added = rhs.added; - deleted = rhs.deleted; + added = (int*) new int [cap]; + deleted = (int*) new int [cap]; + std::memcpy(added, 
rhs.added, cap * sizeof(int)); + std::memcpy(deleted, rhs.deleted, cap * sizeof(int)); } }; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index fcdd26ceb4..dcc7feb3dd 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -45,9 +45,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): BaseMemoryEngine(params), mode(ProcessingMode::NOT_SET), lastAtomAddr(0), numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), - onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), - pendingPullLimit(params.pending_pull_limit), + lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), + pullsReceived(0), pullsScheduled(0), pendingPullLimit(params.pending_pull_limit), pendingPullReads(0), activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), nextMemoryEvent([this] { @@ -74,6 +73,7 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): activeBuffer.clear(); postPushWBQueue.clear(); + blocksTouchedThisTick.clear(); } void @@ -247,6 +247,10 @@ CoalesceEngine::recvWLRead(Addr addr) assert(aligned_addr % peerMemoryAtomSize == 0); int block_index = getBlockIndex(aligned_addr); assert(block_index < numLines); + if (lastReadTick < curTick()) { + blocksTouchedThisTick.clear(); + lastReadTick = curTick(); + } int wl_offset = (addr - aligned_addr) / sizeof(WorkListItem); assert(wl_offset < numElementsPerLine); DPRINTF(CoalesceEngine, "%s: Received a read request for addr: %lu. 
" @@ -289,9 +293,11 @@ CoalesceEngine::recvWLRead(Addr addr) DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + blocksTouchedThisTick.insert(block_index); if (!nextResponseEvent.scheduled()) { schedule(nextResponseEvent, nextCycle()); } + stats.numVertexReads++; return ReadReturnStatus::ACCEPT; } else if ((cacheBlocks[block_index].addr == aligned_addr) && @@ -310,6 +316,8 @@ CoalesceEngine::recvWLRead(Addr addr) "for cacheBlocks[%d].\n", __func__, addr, block_index); DPRINTF(CacheBlockState, "%s: cacheBlocks[%d]: %s.\n", __func__, block_index, cacheBlocks[block_index].to_string()); + blocksTouchedThisTick.insert(block_index); + stats.numVertexReads++; return ReadReturnStatus::ACCEPT; } else { @@ -317,6 +325,11 @@ CoalesceEngine::recvWLRead(Addr addr) assert(cacheBlocks[block_index].addr != aligned_addr); DPRINTF(CoalesceEngine, "%s: Addr: %lu is a miss.\n", __func__, addr); stats.readMisses++; + if (blocksTouchedThisTick.find(block_index) != blocksTouchedThisTick.end()) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] has already been " + "accessed this tick.\n", __func__, block_index); + return ReadReturnStatus::REJECT_ROLL; + } if (cacheBlocks[block_index].state != CacheState::INVALID) { // conflict miss DPRINTF(CoalesceEngine, "%s: Addr: %lu has conflict with " @@ -324,6 +337,8 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].hasConflict = true; if (cacheBlocks[block_index].state == CacheState::IDLE) { if (cacheBlocks[block_index].dirty) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is dirty.\n", + __func__, block_index); cacheBlocks[block_index].state = CacheState::PENDING_WB; cacheBlocks[block_index].lastChangedTick = curTick(); memoryFunctionQueue.emplace_back( @@ -334,10 +349,14 @@ CoalesceEngine::recvWLRead(Addr addr) (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is now " + "pending write 
back.\n", __func__, block_index); } else { // NOTE: The cache block could still be active but // not dirty. If active we only have to active tracking // but can throw the data away. + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is not dirty.\n", + __func__, block_index); bool atom_active_now = false; bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { @@ -345,12 +364,16 @@ CoalesceEngine::recvWLRead(Addr addr) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n", + __func__, block_index); currentActiveCacheBlocks.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); stats.currentBlockActiveCount.sample(count); } if (atom_active_future) { + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n", + __func__, block_index); futureActiveCacheBlocks.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); @@ -360,9 +383,13 @@ CoalesceEngine::recvWLRead(Addr addr) // NOTE: Above line where we set hasConflict to true // does not matter anymore since we reset the cache line. 
cacheBlocks[block_index].reset(); + DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is reset.\n", + __func__, block_index); } + blocksTouchedThisTick.insert(block_index); return ReadReturnStatus::REJECT_NO_ROLL; } else { + blocksTouchedThisTick.insert(block_index); stats.numConflicts++; return ReadReturnStatus::REJECT_ROLL; } @@ -386,6 +413,7 @@ CoalesceEngine::recvWLRead(Addr addr) (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } + blocksTouchedThisTick.insert(block_index); return ReadReturnStatus::ACCEPT; } } @@ -497,7 +525,7 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) responseQueue.size()); DPRINTF(CoalesceEngine, "%s: Added (addr: %lu, wl: %s) " "to responseQueue. responseQueue.size = %d.\n", - __func__, addr, + __func__, miss_addr, graphWorkload->printWorkListItem( cacheBlocks[block_index].items[wl_offset]), responseQueue.size()); @@ -798,6 +826,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { + assert(!cacheBlocks[block_index].items[index].activeNow); atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { @@ -829,6 +858,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { + assert(cacheBlocks[block_index].items[index].activeNow); atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index b6eec725f9..f01475118a 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -29,6 +29,8 @@ #ifndef __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ #define __ACCL_GRAPH_SEGA_COALESCE_ENGINE_HH__ +#include + #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" 
#include "accl/graph/sega/base_memory_engine.hh" @@ -107,6 +109,9 @@ class CoalesceEngine : public BaseMemoryEngine int numElementsPerLine; Block* cacheBlocks; + Tick lastReadTick; + std::unordered_set blocksTouchedThisTick; + int onTheFlyReqs; std::unordered_map> MSHR; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 69e874c0d6..0c96689a5a 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -195,23 +195,24 @@ void WLEngine::processNextReadEvent() { int num_reads = 0; + int num_popped = 0; int num_tries = 0; - std::deque> tempQueue; - + std::deque> temp_queue; for (int i = 0; i < maxUpdatesProcessed; i++) { if (updateQueue.empty()) { break; } - tempQueue.push_back(updateQueue.front()); + temp_queue.push_back(updateQueue.front()); updateQueue.pop_front(); } + int max_visits = temp_queue.size(); + while (true) { - num_tries += 1; Addr update_addr; uint32_t update_value; Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = tempQueue.front(); + std::tie(update_addr, update_value, enter_tick) = temp_queue.front(); DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); @@ -235,48 +236,54 @@ WLEngine::processNextReadEvent() "to registerFile. registerFile.size = %d, " "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); - tempQueue.pop_front(); + temp_queue.pop_front(); num_reads++; + num_popped++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, - update_value, tempQueue.size(), updateQueueSize); + update_value, temp_queue.size(), updateQueueSize); DPRINTF(WLEngine, "%s: Popped (addr: %lu, value: %u) " "from updateQueue. updateQueue.size = %d. 
" "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); vertexReadTime[update_addr] = curTick(); - checkRetryReq(); } else { if (read_status == ReadReturnStatus::REJECT_ROLL) { - tempQueue.pop_front(); - tempQueue.emplace_back( - update_addr, update_value, enter_tick); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); stats.numUpdateRolls++; } else { - DPRINTF(WLEngine, "%s: Received a reject from cache. " - "Not rolling the update.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Received a reject with no roll " + "from cache. Rolling the update anyway.\n", __func__); } } } else { DPRINTF(WLEngine, "%s: There are no free registers " "available in the registerFile.\n", __func__); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); stats.registerShortage++; } } else { + DPRINTF(WLEngine, "%s: A register has already been allocated for " + "addr: %lu in registerFile. registerFile[%lu] = %u.\n", __func__, + update_addr, update_addr, std::get<1>(registerFile[update_addr])); RegisterState state = std::get<0>(registerFile[update_addr]); if (state == RegisterState::PENDING_WRITE) { // NOTE: If it's pending write, let it be written. - tempQueue.pop_front(); - tempQueue.emplace_back(update_addr, update_value, enter_tick); + DPRINTF(WLEngine, "%s: Respective register for addr: " + "%lu is pending a write to the cache. Rolling " + "the update.\n", __func__, update_addr); + temp_queue.pop_front(); + temp_queue.emplace_back(update_addr, update_value, enter_tick); } else { - DPRINTF(WLEngine, "%s: A register has already been allocated for " - "addr: %lu in registerFile. 
registerFile[%lu] = %u.\n", - __func__, update_addr, update_addr, std::get<1>(registerFile[update_addr])); uint32_t curr_value = std::get<1>(registerFile[update_addr]); uint32_t new_value = graphWorkload->reduce(update_value, curr_value); registerFile[update_addr] = std::make_tuple(state, new_value); @@ -284,7 +291,8 @@ WLEngine::processNextReadEvent() " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, std::get<1>(registerFile[update_addr])); stats.registerFileCoalesce++; - tempQueue.pop_front(); + temp_queue.pop_front(); + num_popped++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); DPRINTF(SEGAStructureSize, "%s: Popped (addr: %lu, value: %u) " @@ -295,30 +303,28 @@ WLEngine::processNextReadEvent() "from updateQueue. updateQueue.size = %d. " "updateQueueSize = %d.\n", __func__, update_addr, update_value, updateQueue.size(), updateQueueSize); - checkRetryReq(); } } + num_tries++; if (num_reads >= maxReadsPerCycle) { stats.numReadPortShortage++; break; } - if (num_tries > maxUpdatesProcessed) { + if (num_tries >= max_visits) { break; } - - if (tempQueue.empty()) { + if (temp_queue.empty()) { break; } } - for (int i = 0; i < tempQueue.size(); i++){ - updateQueue.push_front(tempQueue.back()); - tempQueue.pop_back(); + while (!temp_queue.empty()) { + updateQueue.push_front(temp_queue.back()); + temp_queue.pop_back(); } - - if (!toReduce.empty() && !nextReduceEvent.scheduled()) { - schedule(nextReduceEvent, nextCycle()); + if (num_popped > 0) { + checkRetryReq(); } if (!updateQueue.empty() && !nextReadEvent.scheduled()) { schedule(nextReadEvent, nextCycle()); From ba067c725221260a42762529f423d5ef0d2498bb Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Apr 2023 22:21:26 -0700 Subject: [PATCH 267/287] Fixing a typo. 
--- src/accl/graph/sega/wl_engine.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0c96689a5a..4f23d65d32 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -345,7 +345,7 @@ WLEngine::handleIncomingWL(Addr addr, WorkListItem wl) "workListFile. workListFile.size = %d.\n", __func__, addr, graphWorkload->printWorkListItem(wl), workListFile.size()); - uint32_t value = std::get<0>(registerFile[addr]); + uint32_t value = std::get<1>(registerFile[addr]); registerFile[addr] = std::make_tuple(RegisterState::PENDING_REDUCE, value); toReduce.push_back(addr); From 722103c4a7a621e733a45bf514088bd3b0a37943 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Apr 2023 14:05:33 -0700 Subject: [PATCH 268/287] Updating wl_engine stats. Adding coalescing to update queue. --- src/accl/graph/sega/coalesce_engine.cc | 1 - src/accl/graph/sega/wl_engine.cc | 97 ++++++++++++++++---------- src/accl/graph/sega/wl_engine.hh | 12 ++-- 3 files changed, 69 insertions(+), 41 deletions(-) diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index dcc7feb3dd..42ae604833 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -858,7 +858,6 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) bool atom_active_future = false; for (int index = 0; index < numElementsPerLine; index++) { - assert(cacheBlocks[block_index].items[index].activeNow); atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 4f23d65d32..5a4a960635 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -166,22 +166,38 @@ WLEngine::done() bool WLEngine::handleIncomingUpdate(PacketPtr pkt) { - assert((updateQueueSize == 0) ||
(updateQueue.size() <= updateQueueSize)); - if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { - return false; + Addr update_addr = pkt->getAddr(); + uint32_t update_value = pkt->getLE(); + + if (valueMap.find(update_addr) != valueMap.end()) { + assert((updateQueueSize == 0) || + (updateQueue.size() <= updateQueueSize)); + DPRINTF(WLEngine, "%s: Found an already queued update to %lu. " + "Current value is: %u.\n", __func__, + update_addr, valueMap[update_addr]); + valueMap[update_addr] = + graphWorkload->reduce(update_value, valueMap[update_addr]); + stats.numIncomingUpdates++; + stats.updateQueueCoalescions++; + } else { + assert((updateQueueSize == 0) || (updateQueue.size() <= updateQueueSize)); + if ((updateQueueSize != 0) && (updateQueue.size() == updateQueueSize)) { + return false; + } else { + updateQueue.emplace_back(update_addr, curTick()); + valueMap[update_addr] = update_value; + stats.numIncomingUpdates++; + DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " + "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", + __func__, update_addr, update_value, + updateQueue.size(), updateQueueSize); + } } - updateQueue.emplace_back(pkt->getAddr(), pkt->getLE(), curTick()); - stats.numberIncomingUpdaes++; - DPRINTF(SEGAStructureSize, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - DPRINTF(WLEngine, "%s: Emplaced (addr: %lu, value: %u) in the " - "updateQueue. updateQueue.size = %d, updateQueueSize = %d.\n", - __func__, pkt->getAddr(), pkt->getLE(), - updateQueue.size(), updateQueueSize); - // delete the packet since it's not needed anymore.
delete pkt; @@ -194,10 +210,7 @@ WLEngine::handleIncomingUpdate(PacketPtr pkt) void WLEngine::processNextReadEvent() { - int num_reads = 0; - int num_popped = 0; - int num_tries = 0; - std::deque> temp_queue; + std::deque> temp_queue; for (int i = 0; i < maxUpdatesProcessed; i++) { if (updateQueue.empty()) { break; @@ -206,17 +219,18 @@ WLEngine::processNextReadEvent() updateQueue.pop_front(); } + int num_reads = 0; + int num_popped = 0; + int num_tries = 0; int max_visits = temp_queue.size(); - while (true) { Addr update_addr; - uint32_t update_value; Tick enter_tick; - std::tie(update_addr, update_value, enter_tick) = temp_queue.front(); + std::tie(update_addr, enter_tick) = temp_queue.front(); + uint32_t update_value = valueMap[update_addr]; DPRINTF(WLEngine, "%s: Looking at the front of the updateQueue. " "(addr: %lu, value: %u).\n", __func__, update_addr, update_value); - if ((registerFile.find(update_addr) == registerFile.end())) { DPRINTF(WLEngine, "%s: No register already allocated for addr: %lu " "in registerFile.\n", __func__, update_addr); @@ -237,6 +251,7 @@ WLEngine::processNextReadEvent() "registerFileSize = %d.\n", __func__, update_addr, update_value, registerFile.size(), registerFileSize); temp_queue.pop_front(); + valueMap.erase(update_addr); num_reads++; num_popped++; stats.updateQueueLatency.sample( @@ -253,13 +268,13 @@ WLEngine::processNextReadEvent() } else { if (read_status == ReadReturnStatus::REJECT_ROLL) { temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); DPRINTF(WLEngine, "%s: Received a reject from cache. " "Rolling the update.\n", __func__); stats.numUpdateRolls++; } else { temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); DPRINTF(WLEngine, "%s: Received a reject with no roll " "from cache. 
Rolling the update anyway.\n", __func__); } @@ -268,7 +283,7 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: There are no free registers " "available in the registerFile.\n", __func__); temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); stats.registerShortage++; } } else { @@ -282,7 +297,7 @@ WLEngine::processNextReadEvent() "%lu is pending a write to the cache. Rolling " "the update.\n", __func__, update_addr); temp_queue.pop_front(); - temp_queue.emplace_back(update_addr, update_value, enter_tick); + temp_queue.emplace_back(update_addr, enter_tick); } else { uint32_t curr_value = std::get<1>(registerFile[update_addr]); uint32_t new_value = graphWorkload->reduce(update_value, curr_value); @@ -290,8 +305,9 @@ WLEngine::processNextReadEvent() DPRINTF(WLEngine, "%s: Reduced the update_value: %u with the entry in" " registerFile. registerFile[%lu] = %u.\n", __func__, update_value, update_addr, std::get<1>(registerFile[update_addr])); - stats.registerFileCoalesce++; + stats.registerFileCoalescions++; temp_queue.pop_front(); + valueMap.erase(update_addr); num_popped++; stats.updateQueueLatency.sample( (curTick() - enter_tick) * 1e9 / getClockFrequency()); @@ -308,7 +324,9 @@ WLEngine::processNextReadEvent() num_tries++; if (num_reads >= maxReadsPerCycle) { - stats.numReadPortShortage++; + if (!temp_queue.empty()) { + stats.numReadPortShortage++; + } break; } if (num_tries >= max_visits) { @@ -370,11 +388,14 @@ WLEngine::processNextReduceEvent() graphWorkload->reduce(update_value, workListFile[addr].tempProp); registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); num_reduces++; + stats.numReductions++; toReduce.pop_front(); toWrite.push_back(addr); if (num_reduces >= maxReducesPerCycle) { - // TODO: Add stat to count reducer shortage; + if (!toReduce.empty()) { + stats.numReducerShortage++; + } break; } if (toReduce.empty()) { @@ -404,7 +425,9 
@@ WLEngine::processNextWriteEvent() toWrite.pop_front(); num_writes++; if (num_writes >= maxWritesPerCycle) { - stats.numWritePortShortage++; + if (!toWrite.empty()) { + stats.numWritePortShortage++; + } break; } if (toWrite.empty()) { @@ -432,10 +455,8 @@ WLEngine::processNextDoneSignalEvent() WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) : statistics::Group(&_wl), wl(_wl), - ADD_STAT(numReduce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), - ADD_STAT(registerFileCoalesce, statistics::units::Count::get(), - "Number of memory blocks read for vertecies"), + ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), + "Number of coalescions in the update queues."), ADD_STAT(registerShortage, statistics::units::Count::get(), "Number of times updates were " "stalled because of register shortage"), @@ -444,9 +465,15 @@ WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) "to the back of the update queue due to cache reject."), ADD_STAT(numReadPortShortage, statistics::units::Count::get(), "Number of times limited by read per cycle."), + ADD_STAT(registerFileCoalescions, statistics::units::Count::get(), + "Number of coalescions in the register file."), + ADD_STAT(numReductions, statistics::units::Count::get(), + "Number of reductions performed."), + ADD_STAT(numReducerShortage, statistics::units::Count::get(), + "Number of times limited by number of reducers."), ADD_STAT(numWritePortShortage, statistics::units::Count::get(), "Number of times limited by write per cycle."), - ADD_STAT(numberIncomingUpdaes, statistics::units::Count::get(), + ADD_STAT(numIncomingUpdates, statistics::units::Count::get(), "Number of inocoming updates for each GPT."), ADD_STAT(vertexReadLatency, statistics::units::Second::get(), "Histogram of the latency of reading a vertex (ns)."), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 8f55ecadd4..bb8e82f501 100644 ---
a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -77,7 +77,8 @@ class WLEngine : public BaseReduceEngine std::vector inPorts; int updateQueueSize; - std::deque> updateQueue; + std::deque> updateQueue; + std::unordered_map valueMap; int maxReadsPerCycle; int maxReducesPerCycle; @@ -112,14 +113,15 @@ class WLEngine : public BaseReduceEngine void regStats() override; WLEngine &wl; - - statistics::Scalar numReduce; - statistics::Scalar registerFileCoalesce; + statistics::Scalar updateQueueCoalescions; statistics::Scalar registerShortage; statistics::Scalar numUpdateRolls; statistics::Scalar numReadPortShortage; + statistics::Scalar registerFileCoalescions; + statistics::Scalar numReductions; + statistics::Scalar numReducerShortage; statistics::Scalar numWritePortShortage; - statistics::Scalar numberIncomingUpdaes; + statistics::Scalar numIncomingUpdates; statistics::Histogram vertexReadLatency; statistics::Histogram updateQueueLatency; From c1219d9d9945b7a33602d1d7014518bf05f95d40 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Fri, 7 Apr 2023 08:56:37 -0700 Subject: [PATCH 269/287] Adding number of transitions. 
--- src/accl/graph/sega/CoalesceEngine.py | 1 + src/accl/graph/sega/coalesce_engine.cc | 209 ++++++++++++++----------- src/accl/graph/sega/coalesce_engine.hh | 21 +-- 3 files changed, 128 insertions(+), 103 deletions(-) diff --git a/src/accl/graph/sega/CoalesceEngine.py b/src/accl/graph/sega/CoalesceEngine.py index 25f8a1c58b..bb45802c1d 100644 --- a/src/accl/graph/sega/CoalesceEngine.py +++ b/src/accl/graph/sega/CoalesceEngine.py @@ -47,3 +47,4 @@ class CoalesceEngine(BaseMemoryEngine): "apply process for applications that require " "the apply process to happen exactly before " "pushing the edgePointer to the PushEngine.") + transitions_per_cycle = Param.Int("Max number of transitions in a cycle") diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 42ae604833..a2653952e0 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -46,9 +46,10 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): numLines((int) (params.cache_size / peerMemoryAtomSize)), numElementsPerLine((int) (peerMemoryAtomSize / sizeof(WorkListItem))), lastReadTick(0), onTheFlyReqs(0), maxRespPerCycle(params.max_resp_per_cycle), - pullsReceived(0), pullsScheduled(0), pendingPullLimit(params.pending_pull_limit), + numReceivedPulls(0), numScheduledPulls(0), pendingPullLimit(params.pending_pull_limit), pendingPullReads(0), activeBufferSize(params.active_buffer_size), postPushWBQueueSize(params.post_push_wb_queue_size), + transitionsPerCycle(params.transitions_per_cycle), nextMemoryEvent([this] { processNextMemoryEvent(); }, name() + ".nextMemoryEvent"), @@ -68,8 +69,8 @@ CoalesceEngine::CoalesceEngine(const Params ¶ms): for (int i = 0; i < numLines; i++) { cacheBlocks[i] = Block(numElementsPerLine); } - currentActiveCacheBlocks = UniqueFIFO(numLines); - futureActiveCacheBlocks = UniqueFIFO(numLines); + numActiveBlocksNow = UniqueFIFO(numLines); + numActiveBlocksNext = UniqueFIFO(numLines); activeBuffer.clear(); 
postPushWBQueue.clear(); @@ -142,10 +143,10 @@ CoalesceEngine::postConsumeProcess() } } if (!atom_active_future_before && atom_active_future_after) { - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); } if (atom_active_future_before && !atom_active_future_after) { - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); } } else { WorkListItem items[numElementsPerLine]; @@ -199,35 +200,35 @@ void CoalesceEngine::swapDirectories() { assert(currentDirectory->empty()); - assert(currentActiveCacheBlocks.empty()); + assert(numActiveBlocksNow.empty()); // assert currentDirectory is empty WorkDirectory* temp = currentDirectory; currentDirectory = futureDirectory; futureDirectory = temp; - currentActiveCacheBlocks.clear(); - currentActiveCacheBlocks = futureActiveCacheBlocks; - futureActiveCacheBlocks.clear(); + numActiveBlocksNow.clear(); + numActiveBlocksNow = numActiveBlocksNext; + numActiveBlocksNext.clear(); } bool CoalesceEngine::done() { - return memoryFunctionQueue.empty() && currentActiveCacheBlocks.empty() && + return memAccBuffer.empty() && numActiveBlocksNow.empty() && activeBuffer.empty() && currentDirectory->empty() && (onTheFlyReqs == 0); } bool CoalesceEngine::enoughSpace() { - return (activeBuffer.size() + pendingPullReads + pullsScheduled) < activeBufferSize; + return (activeBuffer.size() + pendingPullReads + numScheduledPulls) < activeBufferSize; } bool CoalesceEngine::pullCondition() { bool enough_space = enoughSpace(); - bool schedule_limit = pullsScheduled < pendingPullLimit; + bool schedule_limit = numScheduledPulls < pendingPullLimit; return enough_space && schedule_limit; } @@ -341,7 +342,7 @@ CoalesceEngine::recvWLRead(Addr addr) __func__, block_index); cacheBlocks[block_index].state = CacheState::PENDING_WB; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int block_index, Tick schedule_tick) { 
processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); @@ -366,18 +367,18 @@ CoalesceEngine::recvWLRead(Addr addr) if (atom_active_now) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active now.\n", __func__, block_index); - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); } if (atom_active_future) { DPRINTF(CoalesceEngine, "%s: cacheBlocks[%d] is active next.\n", __func__, block_index); - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } // NOTE: Bring the cache line to invalid state. 
// NOTE: Above line where we set hasConflict to true @@ -405,7 +406,7 @@ CoalesceEngine::recvWLRead(Addr addr) cacheBlocks[block_index].lastChangedTick = curTick(); MSHR[block_index].push_back(addr); - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextRead(block_index, schedule_tick); }, block_index, curTick()); @@ -492,15 +493,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (atom_active_now) { int count = currentDirectory->deactivate(addr); - currentActiveCacheBlocks.push_back(block_index); + numActiveBlocksNow.push_back(block_index); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); } if (atom_active_future) { int count = futureDirectory->deactivate(addr); - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } assert(MSHR.find(block_index) != MSHR.end()); @@ -560,11 +561,11 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) if (atom_active_now) { int count = currentDirectory->deactivate(addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); if (atom_active_future) { int count = futureDirectory->deactivate(addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } activeBuffer.emplace_back(pkt, curTick()); } else { @@ -573,15 +574,15 @@ CoalesceEngine::handleMemResp(PacketPtr pkt) } if (pullCondition()) { - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextVertexPull(ignore, schedule_tick); - }, 0, curTick()); + }, -1, 
curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } - pullsScheduled++; + numScheduledPulls++; } } delete purpose; @@ -681,8 +682,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) cacheBlocks[block_index].items[wl_offset] = wl; if (mode == ProcessingMode::ASYNCHRONOUS) { cacheBlocks[block_index].items[wl_offset].activeNow |= active; - if (active && (!currentActiveCacheBlocks.find(block_index))) { - currentActiveCacheBlocks.push_back(block_index); + if (active && (!numActiveBlocksNow.find(block_index))) { + numActiveBlocksNow.push_back(block_index); if (!owner->running()) { owner->start(); } @@ -690,8 +691,8 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) } if (mode == ProcessingMode::BULK_SYNCHRONOUS) { cacheBlocks[block_index].items[wl_offset].activeFuture |= active; - if (active && (!futureActiveCacheBlocks.find(block_index))) { - futureActiveCacheBlocks.push_back(block_index); + if (active && (!numActiveBlocksNext.find(block_index))) { + numActiveBlocksNext.push_back(block_index); } } @@ -709,7 +710,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) if (cacheBlocks[block_index].dirty) { cacheBlocks[block_index].state = CacheState::PENDING_WB; cacheBlocks[block_index].lastChangedTick = curTick(); - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int block_index, Tick schedule_tick) { processNextWriteBack(block_index, schedule_tick); }, block_index, curTick()); @@ -725,16 +726,16 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_now) { - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + 
stats.countActiveBlocksNow.sample(count); } if (atom_active_future) { - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } cacheBlocks[block_index].reset(); } @@ -756,32 +757,52 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) void CoalesceEngine::processNextMemoryEvent() { - if (memPort.blocked()) { - stats.numMemoryBlocks++; - nextMemoryEvent.sleep(); - return; + int num_transitions = 0; + std::unordered_set transitions; + FunctionDeque temp_deque; + temp_deque.clear(); + + while (true) { + if (memPort.blocked()) { + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } + stats.numMemoryBlocks++; + nextMemoryEvent.sleep(); + return; + } + DPRINTF(CoalesceEngine, "%s: Processing another " + "memory function.\n", __func__); + std::function function; + int input; + Tick tick; + std::tie(function, input, tick) = memAccBuffer.front(); + if ((transitions.find(input) == transitions.end()) || (input == -1)) { + function(input, tick); + memAccBuffer.pop_front(); + transitions.insert(input); + stats.memAccBufferLat.sample((curTick() - tick) * 1e9 / getClockFrequency()); + DPRINTF(CoalesceEngine, "%s: Popped a function from memAccBuffer. 
" + "memAccBuffer.size = %d.\n", __func__, memAccBuffer.size()); + num_transitions++; + } else { + temp_deque.emplace_back(function, input, tick); + memAccBuffer.pop_front(); + } + if ((num_transitions >= transitionsPerCycle) || memAccBuffer.empty()) { + break; + } } - DPRINTF(CoalesceEngine, "%s: Processing another " - "memory function.\n", __func__); - std::function next_memory_function; - int next_memory_function_input; - Tick next_memory_function_tick; - std::tie( - next_memory_function, - next_memory_function_input, - next_memory_function_tick) = memoryFunctionQueue.front(); - next_memory_function(next_memory_function_input, next_memory_function_tick); - memoryFunctionQueue.pop_front(); - stats.memoryFunctionLatency.sample((curTick() - next_memory_function_tick) - * 1e9 / getClockFrequency()); - DPRINTF(CoalesceEngine, "%s: Popped a function from memoryFunctionQueue. " - "memoryFunctionQueue.size = %d.\n", __func__, - memoryFunctionQueue.size()); + while (!temp_deque.empty()) { + memAccBuffer.push_front(temp_deque.back()); + temp_deque.pop_back(); + } assert(!nextMemoryEvent.pending()); assert(!nextMemoryEvent.scheduled()); - if ((!memoryFunctionQueue.empty())) { + if ((!memAccBuffer.empty())) { schedule(nextMemoryEvent, nextCycle()); } @@ -830,7 +851,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); } need_send_pkt = false; @@ -852,7 +873,7 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) cacheBlocks[block_index].lastChangedTick = curTick(); // If an atom is in the activeBuffer, // then it is definitely currently active. - currentActiveCacheBlocks.push_back(block_index); + numActiveBlocksNow.push_back(block_index); // NOTE: Residence in the activeBuffer does not // signify anything about future activity. 
bool atom_active_future = false; @@ -861,18 +882,18 @@ CoalesceEngine::processNextRead(int block_index, Tick schedule_tick) atom_active_future |= cacheBlocks[block_index].items[index].activeFuture; } if (atom_active_future) { - futureActiveCacheBlocks.push_back(block_index); + numActiveBlocksNext.push_back(block_index); } need_send_pkt = false; ab = activeBuffer.erase(ab); delete ab_pkt; if (pullCondition()) { - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextVertexPull(ignore, schedule_tick); - }, 0, curTick()); - pullsScheduled++; + }, -1, curTick()); + numScheduledPulls++; } } else { ab++; @@ -966,20 +987,20 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) "Addr: %lu, size = %d.\n", __func__, pkt->getAddr(), pkt->getSize()); if (atom_active_future) { - futureActiveCacheBlocks.erase(block_index); + numActiveBlocksNext.erase(block_index); } if (atom_active_now) { - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); if (enoughSpace()) { activeBuffer.emplace_back(pkt, curTick()); } else { int count = currentDirectory->activate(cacheBlocks[block_index].addr); stats.currentFrontierSize.sample(currentDirectory->workCount()); - stats.currentBlockActiveCount.sample(count); + stats.countActiveBlocksNow.sample(count); if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); } memPort.sendPacket(pkt); onTheFlyReqs++; @@ -988,7 +1009,7 @@ CoalesceEngine::processNextWriteBack(int block_index, Tick schedule_tick) if (atom_active_future) { int count = futureDirectory->activate(cacheBlocks[block_index].addr); stats.futureFrontierSize.sample(futureDirectory->workCount()); - stats.futureBlockActiveCount.sample(count); + stats.countActiveBlocksNext.sample(count); 
} memPort.sendPacket(pkt); onTheFlyReqs++; @@ -1033,7 +1054,7 @@ void CoalesceEngine::processNextVertexPull(int ignore, Tick schedule_tick) { DPRINTF(CoalesceEngine, "%s: processNextVertexPull called.\n", __func__); - pullsScheduled--; + numScheduledPulls--; if (!currentDirectory->empty()) { Addr addr = currentDirectory->getNextWork(); int block_index = getBlockIndex(addr); @@ -1081,14 +1102,14 @@ CoalesceEngine::recvMemRetry() int CoalesceEngine::workCount() { - return currentActiveCacheBlocks.size() + currentDirectory->workCount() + activeBuffer.size(); + return numActiveBlocksNow.size() + currentDirectory->workCount() + activeBuffer.size(); } void CoalesceEngine::recvVertexPull() { - pullsReceived++; - DPRINTF(CoalesceEngine, "%s: Received a vertex pull. pullsReceived: %d.\n", __func__, pullsReceived); + numReceivedPulls++; + DPRINTF(CoalesceEngine, "%s: Received a vertex pull. numReceivedPulls: %d.\n", __func__, numReceivedPulls); stats.verticesPulled++; stats.lastVertexPullTime = curTick() - stats.lastResetTick; @@ -1109,14 +1130,14 @@ CoalesceEngine::processNextApplyEvent() std::tie(pkt, entrance_tick) = activeBuffer.front(); pkt->writeDataToBlock((uint8_t*) items, peerMemoryAtomSize); - for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { if (items[index].activeNow) { Addr addr = pkt->getAddr() + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(items[index]); items[index].activeNow = false; owner->recvVertexPush(addr, delta, items[index].edgeIndex, items[index].degree); - pullsReceived--; + numReceivedPulls--; stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -1135,23 +1156,23 @@ CoalesceEngine::processNextApplyEvent() peerMemoryAtomSize, (uint8_t*) items); postPushWBQueue.emplace_back(wb_pkt, curTick()); activeBuffer.pop_front(); - memoryFunctionQueue.emplace_back( + 
memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextPostPushWB(ignore, schedule_tick); - }, 0, curTick()); + }, -1, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } delete pkt; } - } else if (!currentActiveCacheBlocks.empty()) { + } else if (!numActiveBlocksNow.empty()) { int num_visited_indices = 0; - int initial_fifo_length = currentActiveCacheBlocks.size(); + int initial_fifo_length = numActiveBlocksNow.size(); while (true) { - int block_index = currentActiveCacheBlocks.front(); + int block_index = numActiveBlocksNow.front(); if (cacheBlocks[block_index].state == CacheState::IDLE) { - for (int index = 0; (index < numElementsPerLine) && (pullsReceived > 0); index++) { + for (int index = 0; (index < numElementsPerLine) && (numReceivedPulls > 0); index++) { if (cacheBlocks[block_index].items[index].activeNow) { Addr addr = cacheBlocks[block_index].addr + index * sizeof(WorkListItem); uint32_t delta = graphWorkload->apply(cacheBlocks[block_index].items[index]); @@ -1160,7 +1181,7 @@ CoalesceEngine::processNextApplyEvent() owner->recvVertexPush(addr, delta, cacheBlocks[block_index].items[index].edgeIndex, cacheBlocks[block_index].items[index].degree); - pullsReceived--; + numReceivedPulls--; stats.verticesPushed++; stats.lastVertexPushTime = curTick() - stats.lastResetTick; } @@ -1172,14 +1193,14 @@ CoalesceEngine::processNextApplyEvent() } // NOTE: If we have reached the last item in the cache block if (!atom_active_now) { - currentActiveCacheBlocks.erase(block_index); + numActiveBlocksNow.erase(block_index); } break; } // NOTE: If the block with index at the front of activeCacheBlocks // is not in IDLE state, then roll the that index to the back - currentActiveCacheBlocks.pop_front(); - currentActiveCacheBlocks.push_back(block_index); + numActiveBlocksNow.pop_front(); + numActiveBlocksNow.push_back(block_index); // NOTE: If we have visited all the items initially 
in the FIFO. num_visited_indices++; if (num_visited_indices == initial_fifo_length) { @@ -1192,18 +1213,18 @@ CoalesceEngine::processNextApplyEvent() } if (pullCondition()) { - memoryFunctionQueue.emplace_back( + memAccBuffer.emplace_back( [this] (int ignore, Tick schedule_tick) { processNextVertexPull(ignore, schedule_tick); - }, 0, curTick()); + }, -1, curTick()); if ((!nextMemoryEvent.pending()) && (!nextMemoryEvent.scheduled())) { schedule(nextMemoryEvent, nextCycle()); } - pullsScheduled++; + numScheduledPulls++; } - if ((pullsReceived > 0) && (!nextApplyEvent.scheduled())) { + if ((numReceivedPulls > 0) && (!nextApplyEvent.scheduled())) { schedule(nextApplyEvent, nextCycle()); } } @@ -1261,13 +1282,13 @@ CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) "Histogram of the length of the current bitvector."), ADD_STAT(futureFrontierSize, statistics::units::Count::get(), "Histogram of the length of the future bitvector."), - ADD_STAT(currentBlockActiveCount, statistics::units::Count::get(), + ADD_STAT(countActiveBlocksNow, statistics::units::Count::get(), "Histogram of the popCount values in the current directory"), - ADD_STAT(futureBlockActiveCount, statistics::units::Count::get(), + ADD_STAT(countActiveBlocksNext, statistics::units::Count::get(), "Histogram of the popCount values in the future directory"), ADD_STAT(responseQueueLatency, statistics::units::Second::get(), "Histogram of the response latency to WLEngine. 
(ns)"), - ADD_STAT(memoryFunctionLatency, statistics::units::Second::get(), + ADD_STAT(memAccBufferLat, statistics::units::Second::get(), "Histogram of the latency of processing a memory function.") { } @@ -1286,10 +1307,10 @@ CoalesceEngine::CoalesceStats::regStats() currentFrontierSize.init(64); futureFrontierSize.init(64); - currentBlockActiveCount.init(64); - futureBlockActiveCount.init(64); + countActiveBlocksNow.init(64); + countActiveBlocksNext.init(64); responseQueueLatency.init(64); - memoryFunctionLatency.init(64); + memAccBufferLat.init(64); } void diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index f01475118a..4066c7dbe5 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,6 +43,8 @@ namespace gem5 { +typedef std::deque, int, Tick>> FunctionDeque; + class MPU; class CoalesceEngine : public BaseMemoryEngine @@ -120,12 +122,12 @@ class CoalesceEngine : public BaseMemoryEngine std::deque> responseQueue; // Tracking work in cache - int pullsReceived; + int numReceivedPulls; // NOTE: Remember to erase from these upon eviction from cache - UniqueFIFO currentActiveCacheBlocks; - UniqueFIFO futureActiveCacheBlocks; + UniqueFIFO numActiveBlocksNow; + UniqueFIFO numActiveBlocksNext; - int pullsScheduled; + int numScheduledPulls; int pendingPullLimit; int pendingPullReads; // A map from addr to sendMask. 
sendMask determines which bytes to @@ -141,14 +143,15 @@ class CoalesceEngine : public BaseMemoryEngine bool pullCondition(); int getBlockIndex(Addr addr); + int transitionsPerCycle; + FunctionDeque memAccBuffer; + MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); void processNextRead(int block_index, Tick schedule_tick); void processNextWriteBack(int block_index, Tick schedule_tick); void processNextVertexPull(int ignore, Tick schedule_tick); void processNextPostPushWB(int ignore, Tick schedule_tick); - std::deque, int, Tick>> memoryFunctionQueue; EventFunctionWrapper nextResponseEvent; void processNextResponseEvent(); @@ -192,10 +195,10 @@ class CoalesceEngine : public BaseMemoryEngine statistics::Histogram currentFrontierSize; statistics::Histogram futureFrontierSize; - statistics::Histogram currentBlockActiveCount; - statistics::Histogram futureBlockActiveCount; + statistics::Histogram countActiveBlocksNow; + statistics::Histogram countActiveBlocksNext; statistics::Histogram responseQueueLatency; - statistics::Histogram memoryFunctionLatency; + statistics::Histogram memAccBufferLat; }; CoalesceStats stats; From 4ee3ac37bb8bbe88ba4e61be2be5e3efe2a433a0 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 9 Apr 2023 18:47:46 -0700 Subject: [PATCH 270/287] Improving the performance of pushEngine. 
--- src/accl/graph/sega/push_engine.cc | 45 ++++++++++++++++++++++++------ src/accl/graph/sega/push_engine.hh | 2 +- 2 files changed, 38 insertions(+), 9 deletions(-) diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index de3764a605..893643c510 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -295,6 +295,7 @@ PushEngine::handleMemResp(PacketPtr pkt) metaEdgeQueue.emplace_back(meta_edge, curTick()); stats.edgeQueueLength.sample(metaEdgeQueue.size()); } + stats.edgeQueueLength.sample(metaEdgeQueue.size()); stats.numWastefulEdgesRead += (peerMemoryAtomSize / sizeof(Edge)) - push_info.numElements; @@ -313,38 +314,57 @@ void PushEngine::processNextPropagateEvent() { int num_propagates = 0; + int num_tries = 0; + int num_reads = 0; + std::deque> temp_edge; + for (int i = 0; i < maxPropagatesPerCycle; i++) { + if (metaEdgeQueue.empty()) { + break; + } + temp_edge.push_back(metaEdgeQueue.front()); + metaEdgeQueue.pop_front(); + } + int max_visits = temp_edge.size(); + while(true) { MetaEdge meta_edge; Tick entrance_tick; - std::tie(meta_edge, entrance_tick) = metaEdgeQueue.front(); + std::tie(meta_edge, entrance_tick) = temp_edge.front(); DPRINTF(PushEngine, "%s: The edge to process is %s.\n", __func__, meta_edge.to_string()); uint32_t update_value = graphWorkload->propagate(meta_edge.value, meta_edge.weight); - metaEdgeQueue.pop_front(); + temp_edge.pop_front(); + num_tries++; if (enqueueUpdate(meta_edge.src, meta_edge.dst, update_value)) { DPRINTF(PushEngine, "%s: Sent %s to port queues.\n", __func__, meta_edge.to_string()); + num_reads++; stats.numPropagates++; stats.edgeQueueLatency.sample( (curTick() - entrance_tick) * 1e9 / getClockFrequency()); - stats.edgeQueueLength.sample(metaEdgeQueue.size()); } else { - metaEdgeQueue.emplace_back(meta_edge, entrance_tick); + temp_edge.emplace_back(meta_edge, entrance_tick); + stats.updateQueueFull++; } num_propagates++; - if (metaEdgeQueue.empty()) { 
+ if (temp_edge.empty()) { break; } - if (num_propagates >= maxPropagatesPerCycle) { + if (num_tries >= max_visits) { break; } } + while (!temp_edge.empty()) { + metaEdgeQueue.push_front(temp_edge.back()); + temp_edge.pop_back(); + } + stats.numPropagatesHist.sample(num_propagates); assert(!nextPropagateEvent.scheduled()); @@ -370,6 +390,11 @@ PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) assert(destinationQueues[port_id].size() == sourceAndValueMaps[port_id].size()); + int num_updates = 0; + for (auto queue: destinationQueues) { + num_updates += queue.size(); + } + if (sourceAndValueMaps[port_id].find(dst) != sourceAndValueMaps[port_id].end()) { DPRINTF(PushEngine, "%s: Found an existing update " "for dst: %lu.\n", __func__, dst); @@ -385,7 +410,7 @@ PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) prev_src, dst, new_val); stats.updateQueueCoalescions++; return true; - } else if (destinationQueues[port_id].size() < updateQueueSize) { + } else if (num_updates < (updateQueueSize * destinationQueues.size())) { DPRINTF(PushEngine, "%s: There is a free entry available " "in queue for port %d.\n", __func__, port_id); destinationQueues[port_id].emplace_back(dst, curTick()); @@ -401,6 +426,8 @@ PushEngine::enqueueUpdate(Addr src, Addr dst, uint32_t value) } return true; } + DPRINTF(PushEngine, "%s: DestinationQueue for pot %d is blocked.\n", + __func__, port_id); return false; } @@ -468,6 +495,8 @@ PushEngine::PushStats::PushStats(PushEngine &_push) push(_push), ADD_STAT(numPropagates, statistics::units::Count::get(), "Number of propagate operations done."), + ADD_STAT(updateQueueFull, statistics::units::Count::get(), + "Number of times the update queue returns false."), ADD_STAT(numNetBlocks, statistics::units::Count::get(), "Number of updates blocked by network."), // ADD_STAT(numIdleCycles, statistics::units::Count::get(), @@ -508,7 +537,7 @@ PushEngine::PushStats::regStats() edgeQueueLatency.init(64); edgeQueueLength.init(64); 
updateQueueLength.init(64); - numPropagatesHist.init(push.params().max_propagates_per_cycle); + numPropagatesHist.init(1 + push.params().max_propagates_per_cycle); } } // namespace gem5 diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 9f489455ac..08a5d278f5 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -172,8 +172,8 @@ class PushEngine : public BaseMemoryEngine statistics::Scalar numMemoryBlocks; statistics::Scalar numPropagates; + statistics::Scalar updateQueueFull; statistics::Scalar numNetBlocks; - // statistics::Scalar numIdleCycles; statistics::Scalar updateQueueCoalescions; statistics::Scalar numUpdates; statistics::Scalar numWastefulEdgesRead; From c064cea56064e31707abb78a166ee2c611794518 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Tue, 4 Apr 2023 22:18:41 -0700 Subject: [PATCH 271/287] Initial commit for PG. --- configs/accl/bfs.py | 3 +- configs/accl/sega.py | 31 ++- src/accl/graph/base/data_structs.hh | 35 ++++ src/accl/graph/sega/CenteralController.py | 4 + src/accl/graph/sega/centeral_controller.cc | 225 ++++++++++++++++++++- src/accl/graph/sega/centeral_controller.hh | 106 +++++++++- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/enums.cc | 3 + src/accl/graph/sega/enums.hh | 10 + src/accl/graph/sega/wl_engine.cc | 5 + 10 files changed, 418 insertions(+), 6 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 6c3384a2d7..9a280f116b 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -131,7 +131,8 @@ def get_inputs(): m5.instantiate() - system.set_async_mode() + # system.set_async_mode() + system.set_pg_mode() system.create_pop_count_directory(64) if visited: system.create_bfs_visited_workload(init_addr, init_value) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 58a8caddde..ca1f4b9381 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -137,7 +137,13 @@ def setPort(self, port): class 
SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, graph_path): + def __init__( + self, + num_gpts, + num_registers, + cache_size, + graph_path, + ): super(SEGA, self).__init__() # num_gpts should be an even power of 2 assert num_gpts != 0 @@ -151,8 +157,26 @@ def __init__(self, num_gpts, num_registers, cache_size, graph_path): self.mem_mode = "timing" # Building the CenteralController + self.mirror_mem = SimpleMemory( + latency="90ns", + latency_var="0ns", + bandwidth="28GiB/s", + image_file=f"{graph_path}/mirrors", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.map_mem = SimpleMemory( + latency="90ns", + latency_var="0ns", + bandwidth="28GiB/s", + image_file=f"{graph_path}/mirrors_map", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) self.ctrl = CenteralController( - vertex_image_file=f"{graph_path}/vertices" + vertex_image_file=f"{graph_path}/vertices", + mirrors_mem=self.mirror_mem.port, + mirrors_map_mem=self.map_mem.port, ) # Building the EdgeMemories edge_mem = [] @@ -193,6 +217,9 @@ def set_async_mode(self): def set_bsp_mode(self): self.ctrl.setBSPMode() + def set_pg_mode(self): + self.ctrl.setPGMode() + def create_pop_count_directory(self, atoms_per_block): self.ctrl.createPopCountDirectory(atoms_per_block) diff --git a/src/accl/graph/base/data_structs.hh b/src/accl/graph/base/data_structs.hh index 60391b3a7c..f1a26f6ac2 100644 --- a/src/accl/graph/base/data_structs.hh +++ b/src/accl/graph/base/data_structs.hh @@ -94,8 +94,43 @@ struct __attribute__ ((packed)) Edge {} }; +struct __attribute__ ((packed)) MirrorVertex +{ + uint32_t vertexId : 32; + uint32_t prop : 32; + uint32_t edgeIndex : 32; + uint32_t degree : 30; + bool activeNow: 1; + bool activeNext: 1; + + std::string to_string() + { + return csprintf("MirrorVertex{vertexId: %u, prop: %u, edgeIndex: %u, " + "degree: %u, activeNow: %s, activeNext: %s}", + vertexId, prop, edgeIndex, degree, + activeNow ? "true" : "false", + activeNext ? 
"true" : "false"); + } + MirrorVertex(): + vertexId(-1), + prop(-1), + edgeIndex(-1), + degree(-1), + activeNow(false), + activeNext(false) + {} + + MirrorVertex(uint32_t vertex_id, uint32_t prop, uint32_t degree, + uint32_t edge_index, bool active_now, bool active_next): + vertexId(vertex_id), prop(prop), edgeIndex(edge_index), + degree(degree), activeNow(active_now), activeNext(active_next) + {} + +}; + static_assert(isPowerOf2(sizeof(WorkListItem))); static_assert(isPowerOf2(sizeof(Edge))); +static_assert(isPowerOf2(sizeof(MirrorVertex))); struct MetaEdge { uint64_t src; diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index 211b1a694b..a61ca133a1 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -35,6 +35,9 @@ class CenteralController(ClockedObject): cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' + mirrors_mem = RequestPort("Port to a memory storing vertex mirrors file.") + mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") + system = Param.System(Parent.any, "System this Engine is a part of") vertex_image_file = Param.String("Path to the vertex image file.") @@ -46,6 +49,7 @@ class CenteralController(ClockedObject): cxx_exports = [ PyBindMethod("setAsyncMode"), PyBindMethod("setBSPMode"), + PyBindMethod("setPGMode"), PyBindMethod("createPopCountDirectory"), PyBindMethod("createBFSWorkload"), PyBindMethod("createBFSVisitedWorkload"), diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 46c6133947..72b7914d45 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -43,7 +43,13 @@ namespace gem5 CenteralController::CenteralController(const Params& params): ClockedObject(params), system(params.system), - mode(ProcessingMode::NOT_SET) + mirrorsPort("mirrors_mem", this, 0), 
mapPort("map_port", this, 1), + mode(ProcessingMode::NOT_SET), currentSliceNumber(0), totalSliceNumber(148), + lastReadPacketId(0), + nextMirrorMapReadEvent([this] { processNextMirrorMapReadEvent(); }, name()), + nextMirrorReadEvent([this] { processNextMirrorReadEvent(); }, name()), + nextMirrorUpdateEvent([this] { processNextMirrorUpdateEvent(); }, name()), + nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()) { for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); @@ -56,6 +62,18 @@ CenteralController::CenteralController(const Params& params): } } +Port& +CenteralController::getPort(const std::string& if_name, PortID idx) +{ + if (if_name == "mirrors_mem") { + return mirrorsPort; + } else if (if_name == "mirrors_map_mem") { + return mapPort; + } else { + return ClockedObject::getPort(if_name, idx); + } +} + void CenteralController::createBFSWorkload(Addr init_addr, uint32_t init_value) { @@ -113,6 +131,11 @@ CenteralController::createPopCountDirectory(int atoms_per_block) mpu->createBSPPopCountDirectory(atoms_per_block); } } + if (mode == ProcessingMode::POLY_GRAPH) { + for (auto mpu: mpuVector) { + mpu->createAsyncPopCountDirectory(atoms_per_block); + } + } } void @@ -157,6 +180,45 @@ CenteralController::startup() workload->iterate(); } +void +CenteralController::ReqPort::sendPacket(PacketPtr pkt) +{ + panic_if(blockedPacket != nullptr, + "Should never try to send if blocked!"); + // If we can't send the packet across the port, store it for later. 
+ if (!sendTimingReq(pkt)) + { + DPRINTF(CenteralController, "%s: Port %d: Packet %s is blocked.\n", __func__, _id, pkt->print()); + blockedPacket = pkt; + } else { + DPRINTF(CenteralController, "%s: Port %d: Packet %s sent.\n", __func__, _id, pkt->print()); + } +} + +bool +CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) +{ + DPRINTF(CenteralController, "%s: Port %d received pkt: %s.\n", __func__, _id, pkt->print()); + return owner->handleMemResp(pkt, _id); +} + +void +CenteralController::ReqPort::recvReqRetry() +{ + panic_if(blockedPacket == nullptr, + "Received retry without a blockedPacket."); + + DPRINTF(CenteralController, "%s: ReqPort %d received a reqRetry. " + "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); + PacketPtr pkt = blockedPacket; + blockedPacket = nullptr; + sendPacket(pkt); + if (blockedPacket == nullptr) { + DPRINTF(CenteralController, "%s: blockedPacket sent successfully.\n", __func__); + owner->recvReqRetry(_id); + } +} + PacketPtr CenteralController::createReadPacket(Addr addr, unsigned int size) { @@ -199,8 +261,169 @@ CenteralController::recvDoneSignal() workload->iterate(); exitSimLoopNow("finished an iteration."); } + + if (done && mode == ProcessingMode::POLY_GRAPH) { + // assert(!nextMirrorMapReadEvent.scheduled()); + if (!nextMirrorMapReadEvent.scheduled()) { + schedule(nextMirrorMapReadEvent, nextCycle()); + } + } +} + +void +CenteralController::processNextMirrorMapReadEvent() +{ + // TODO: In future add functionality to align start_addr and end_addr to + // size of the vertex atom. 
+ Addr start_addr = currentSliceNumber * totalSliceNumber * sizeof(int); + Addr end_addr = start_addr + totalSliceNumber * sizeof(int); + PacketPtr start = createReadPacket(start_addr, sizeof(int)); + PointerTag* start_tag = new PointerTag(lastReadPacketId, PointerType::START); + start->pushSenderState(start_tag); + PacketPtr end = createReadPacket(end_addr, sizeof(int)); + PointerTag* end_tag = new PointerTag(lastReadPacketId, PointerType::END); + end->pushSenderState(end_tag); + lastReadPacketId++; + mapPort.sendPacket(start); + mapPort.sendPacket(end); +} + +bool +CenteralController::handleMemResp(PacketPtr pkt, PortID id) +{ + assert(pkt->isResponse()); + if (id == 0) { + if (pkt->isWrite()) { + delete pkt; + return true; + } + assert(reqInfoMap.find(pkt->req) != reqInfoMap.end()); + Addr offset; + int num_mirrors; + int pkt_size_in_mirrors = pkt->getSize() / sizeof(MirrorVertex); + MirrorVertex data[pkt_size_in_mirrors]; + pkt->writeDataToBlock((uint8_t*) data, pkt->getSize()); + + std::tie(offset, num_mirrors) = reqInfoMap[pkt->req]; + assert(num_mirrors > 0); + offset = (int) (offset / sizeof(MirrorVertex)); + for (int i = 0; i < num_mirrors; i++) { + mirrorQueue.push_back(data[i + offset]); + } + delete pkt; + + if (!nextMirrorUpdateEvent.scheduled()) { + schedule(nextMirrorUpdateEvent, nextCycle()); + } + return true; + } else if (id == 1) { + PointerTag* tag = pkt->findNextSenderState(); + int read_id = tag->Id(); + PointerType read_type = tag->type(); + if (read_type == PointerType::START) { + assert(startAddrs.find(read_id) == startAddrs.end()); + startAddrs[read_id] = pkt->getLE(); + if (endAddrs.find(read_id) != endAddrs.end()) { + int vertex_atom = mpuVector.front()->vertexAtomSize(); + mirrorPointerQueue.emplace_back( + startAddrs[read_id], endAddrs[read_id], + sizeof(MirrorVertex), vertex_atom); + if (!nextMirrorReadEvent.scheduled()) { + schedule(nextMirrorReadEvent, nextCycle()); + } + } + } else { + assert(read_type == PointerType::END); + 
assert(endAddrs.find(read_id) == endAddrs.end()); + endAddrs[read_id] = pkt->getLE(); + if (startAddrs.find(read_id) != startAddrs.end()) { + int vertex_atom = mpuVector.front()->vertexAtomSize(); + mirrorPointerQueue.emplace_back( + startAddrs[read_id], endAddrs[read_id], + sizeof(MirrorVertex), vertex_atom); + if (!nextMirrorReadEvent.scheduled()) { + schedule(nextMirrorReadEvent, nextCycle()); + } + } + } + DPRINTF(CenteralController, "%s: Received pkt: %s from port %d " + "with value: %d.\n", __func__, + pkt->print(), id, pkt->getLE()); + delete tag; + delete pkt; + return true; + } else { + panic("did not expect this."); + } +} + +void +CenteralController::recvReqRetry(PortID id) { + if (id == 0) { + assert(!nextMirrorReadEvent.scheduled()); + if (!mirrorPointerQueue.empty()) { + schedule(nextMirrorReadEvent, nextCycle()); + } + } else if (id == 1) { + DPRINTF(CenteralController, "%s: Ignoring reqRetry " + "for port %d.\n", __func__, id); + } else { + panic("Did not expect the other."); + } +} + +void +CenteralController::processNextMirrorReadEvent() +{ + Addr aligned_addr, offset; + int num_mirrors; + + int vertex_atom = mpuVector.front()->vertexAtomSize(); + MirrorReadInfoGen& front = mirrorPointerQueue.front(); + std::tie(aligned_addr, offset, num_mirrors) = front.nextReadPacketInfo(); + PacketPtr pkt = createReadPacket(aligned_addr, vertex_atom); + mirrorsPort.sendPacket(pkt); + reqInfoMap[pkt->req] = std::make_tuple(offset, num_mirrors); + front.iterate(); + if (front.done()) { + mirrorPointerQueue.pop_front(); + } + + if (!mirrorPointerQueue.empty() && !mirrorsPort.blocked()) { + schedule(nextMirrorReadEvent, nextCycle()); + } +} + +void +CenteralController::processNextMirrorUpdateEvent() +{ + int vertex_atom = mpuVector.front()->vertexAtomSize(); + MirrorVertex front = mirrorQueue.front(); + Addr org_addr = front.vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (aligned_org_addr - org_addr) / 
sizeof(WorkListItem); + int num_items = vertex_atom / sizeof(WorkListItem); + WorkListItem data[num_items]; + + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, org_addr)) { + mpu->recvFunctional(read_org); + } + } + read_org->writeDataToBlock((uint8_t*) data, vertex_atom); + DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, workload->printWorkListItem(data[wl_offset]), front.to_string()); + std::cout << workload->printWorkListItem(data[wl_offset]) << std::endl; + mirrorQueue.pop_front(); + if (!mirrorQueue.empty()) { + schedule(nextMirrorUpdateEvent, nextCycle()); + } } +void +CenteralController::processNextWriteBackEvent() {} + int CenteralController::workCount() { diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index e1f3f413b5..d99f26405f 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -37,6 +37,7 @@ #include "accl/graph/sega/mpu.hh" #include "accl/graph/sega/router_engine.hh" #include "base/addr_range.hh" +#include "base/intmath.hh" #include "params/CenteralController.hh" #include "sim/clocked_object.hh" #include "sim/system.hh" @@ -47,7 +48,79 @@ namespace gem5 class CenteralController : public ClockedObject { private: + class ReqPort : public RequestPort + { + private: + CenteralController* owner; + PacketPtr blockedPacket; + PortID _id; + + public: + ReqPort(const std::string& name, CenteralController* owner, PortID id): + RequestPort(name, owner), + owner(owner), blockedPacket(nullptr), _id(id) + {} + void sendPacket(PacketPtr pkt); + bool blocked() { return (blockedPacket != nullptr); } + + protected: + virtual bool recvTimingResp(PacketPtr pkt); + virtual void recvReqRetry(); + }; + + struct PointerTag : public Packet::SenderState + { + int _id; + PointerType _type; + PointerTag(int id, PointerType 
type): _id(id), _type(type) {} + int Id() { return _id; } + PointerType type() { return _type; } + + }; + + class MirrorReadInfoGen { + private: + Addr _start; + Addr _end; + size_t _step; + size_t _atom; + + public: + MirrorReadInfoGen(Addr start, Addr end, size_t step, size_t atom): + _start(start), _end(end), _step(step), _atom(atom) + {} + + std::tuple nextReadPacketInfo() + { + panic_if(done(), "Should not call nextPacketInfo when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + Addr offset = _start - aligned_addr; + int num_items = 0; + + if (_end > (aligned_addr + _atom)) { + num_items = (_atom - offset) / _step; + } else { + num_items = (_end - _start) / _step; + } + + return std::make_tuple(aligned_addr, offset, num_items); + } + + void iterate() + { + panic_if(done(), "Should not call iterate when done.\n"); + Addr aligned_addr = roundDown(_start, _atom); + _start = aligned_addr + _atom; + } + + bool done() { return (_start >= _end); } + }; + System* system; + + ReqPort mirrorsPort; + ReqPort mapPort; + Addr maxVertexAddr; ProcessingMode mode; @@ -57,18 +130,49 @@ class CenteralController : public ClockedObject std::unordered_map addrRangeListMap; + // FIXME: Initialize these two. 
+ int currentSliceNumber; + int totalSliceNumber; + int lastReadPacketId; + std::unordered_map startAddrs; + std::unordered_map endAddrs; + // TODO: Set a max size for this queue; + std::deque mirrorPointerQueue; + std::unordered_map> reqInfoMap; + + std::deque mirrorQueue; + std::deque writeBackQueue; + PacketPtr createReadPacket(Addr addr, unsigned int size); - public: + bool handleMemResp(PacketPtr pkt, PortID id); + void recvReqRetry(PortID id); + EventFunctionWrapper nextMirrorMapReadEvent; + void processNextMirrorMapReadEvent(); + + EventFunctionWrapper nextMirrorReadEvent; + void processNextMirrorReadEvent(); + + EventFunctionWrapper nextMirrorUpdateEvent; + void processNextMirrorUpdateEvent(); + + EventFunctionWrapper nextWriteBackEvent; + void processNextWriteBackEvent(); + + public: GraphWorkload* workload; PARAMS(CenteralController); CenteralController(const CenteralControllerParams ¶ms); + Port& getPort(const std::string& if_name, + PortID idx = InvalidPortID) override; + virtual void startup() override; void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } + void setPGMode() { mode = ProcessingMode::POLY_GRAPH; } void createPopCountDirectory(int atoms_per_block); diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index a2653952e0..083e8d4c37 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -680,7 +680,7 @@ CoalesceEngine::recvWLWrite(Addr addr, WorkListItem wl) bool active = graphWorkload->activeCondition(wl, cacheBlocks[block_index].items[wl_offset]); cacheBlocks[block_index].items[wl_offset] = wl; - if (mode == ProcessingMode::ASYNCHRONOUS) { + if (mode == ProcessingMode::ASYNCHRONOUS || mode == ProcessingMode::POLY_GRAPH) { cacheBlocks[block_index].items[wl_offset].activeNow |= active; if (active && (!numActiveBlocksNow.find(block_index))) { numActiveBlocksNow.push_back(block_index); diff 
--git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 2f1bc983eb..5b8de3404f 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -62,7 +62,10 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = { "NOT_SET", "ASYNCHRONOUS", + "POLY_GRAPH", "BULK_SYNCHRONOUS" }; +const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"N/A", "START", "END"}; + } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 4e7d64235e..92e293bec0 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -74,10 +74,20 @@ enum ProcessingMode NOT_SET, ASYNCHRONOUS, BULK_SYNCHRONOUS, + POLY_GRAPH, NUM_PROCESSING_MODE }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; +enum PointerType +{ + NA, + START, + END, + NUM_POINTER_TYPE +}; +extern const char* pointerTypeStrings[NUM_POINTER_TYPE]; + } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 5a4a960635..c294441703 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -384,8 +384,13 @@ WLEngine::processNextReduceEvent() Addr addr = toReduce.front(); assert(std::get<0>(registerFile[addr]) == RegisterState::PENDING_REDUCE); uint32_t update_value = std::get<1>(registerFile[addr]); + DPRINTF(WLEngine, "%s: Reducing for addr: %lu, update_value: %u, " + "temp_prop: %s.\n", __func__, addr, + update_value, workListFile[addr].tempProp); workListFile[addr].tempProp = graphWorkload->reduce(update_value, workListFile[addr].tempProp); + DPRINTF(WLEngine, "%s: Reduction result: %s", __func__, + graphWorkload->printWorkListItem(workListFile[addr])); registerFile[addr] = std::make_tuple(RegisterState::PENDING_WRITE, update_value); num_reduces++; stats.numReductions++; From 2216d7c49dedd1d1d795fa6eccdaa81b4573061b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 5 Apr 2023 05:30:11 -0700 Subject: 
[PATCH 272/287] Fixing typo in centeral controller. --- src/accl/graph/sega/centeral_controller.cc | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 72b7914d45..928425c001 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -188,17 +188,18 @@ CenteralController::ReqPort::sendPacket(PacketPtr pkt) // If we can't send the packet across the port, store it for later. if (!sendTimingReq(pkt)) { - DPRINTF(CenteralController, "%s: Port %d: Packet %s is blocked.\n", __func__, _id, pkt->print()); + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "is blocked.\n", __func__, _id, pkt->print()); blockedPacket = pkt; } else { - DPRINTF(CenteralController, "%s: Port %d: Packet %s sent.\n", __func__, _id, pkt->print()); + DPRINTF(CenteralController, "%s: Port %d: Packet %s " + "sent.\n", __func__, _id, pkt->print()); } } bool CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) { - DPRINTF(CenteralController, "%s: Port %d received pkt: %s.\n", __func__, _id, pkt->print()); return owner->handleMemResp(pkt, _id); } @@ -214,7 +215,8 @@ CenteralController::ReqPort::recvReqRetry() blockedPacket = nullptr; sendPacket(pkt); if (blockedPacket == nullptr) { - DPRINTF(CenteralController, "%s: blockedPacket sent successfully.\n", __func__); + DPRINTF(CenteralController, "%s: blockedPacket sent " + "successfully.\n", __func__); owner->recvReqRetry(_id); } } @@ -401,7 +403,7 @@ CenteralController::processNextMirrorUpdateEvent() MirrorVertex front = mirrorQueue.front(); Addr org_addr = front.vertexId * sizeof(WorkListItem); Addr aligned_org_addr = roundDown(org_addr, vertex_atom); - int wl_offset = (aligned_org_addr - org_addr) / sizeof(WorkListItem); + int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); int num_items = vertex_atom / sizeof(WorkListItem); WorkListItem 
data[num_items]; @@ -413,9 +415,10 @@ CenteralController::processNextMirrorUpdateEvent() } } read_org->writeDataToBlock((uint8_t*) data, vertex_atom); - DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, workload->printWorkListItem(data[wl_offset]), front.to_string()); - std::cout << workload->printWorkListItem(data[wl_offset]) << std::endl; + DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, + workload->printWorkListItem(data[wl_offset]), front.to_string()); mirrorQueue.pop_front(); + delete read_org; if (!mirrorQueue.empty()) { schedule(nextMirrorUpdateEvent, nextCycle()); } From ad1446d70d3a4fc508f8ebb548142c29728737a9 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 6 Apr 2023 19:54:16 -0700 Subject: [PATCH 273/287] Updating centeral controller. --- src/accl/graph/sega/centeral_controller.cc | 79 +++++++++++++--------- src/accl/graph/sega/centeral_controller.hh | 6 +- 2 files changed, 50 insertions(+), 35 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 928425c001..4885ca83a3 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -299,21 +299,8 @@ CenteralController::handleMemResp(PacketPtr pkt, PortID id) delete pkt; return true; } - assert(reqInfoMap.find(pkt->req) != reqInfoMap.end()); - Addr offset; - int num_mirrors; - int pkt_size_in_mirrors = pkt->getSize() / sizeof(MirrorVertex); - MirrorVertex data[pkt_size_in_mirrors]; - pkt->writeDataToBlock((uint8_t*) data, pkt->getSize()); - - std::tie(offset, num_mirrors) = reqInfoMap[pkt->req]; - assert(num_mirrors > 0); - offset = (int) (offset / sizeof(MirrorVertex)); - for (int i = 0; i < num_mirrors; i++) { - mirrorQueue.push_back(data[i + offset]); - } + readQueue.push_back(pkt); delete pkt; - if (!nextMirrorUpdateEvent.scheduled()) { schedule(nextMirrorUpdateEvent, nextCycle()); } @@ -385,7 +372,6 @@ CenteralController::processNextMirrorReadEvent() 
std::tie(aligned_addr, offset, num_mirrors) = front.nextReadPacketInfo(); PacketPtr pkt = createReadPacket(aligned_addr, vertex_atom); mirrorsPort.sendPacket(pkt); - reqInfoMap[pkt->req] = std::make_tuple(offset, num_mirrors); front.iterate(); if (front.done()) { mirrorPointerQueue.pop_front(); @@ -400,32 +386,59 @@ void CenteralController::processNextMirrorUpdateEvent() { int vertex_atom = mpuVector.front()->vertexAtomSize(); - MirrorVertex front = mirrorQueue.front(); - Addr org_addr = front.vertexId * sizeof(WorkListItem); - Addr aligned_org_addr = roundDown(org_addr, vertex_atom); - int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); - int num_items = vertex_atom / sizeof(WorkListItem); - WorkListItem data[num_items]; - PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, org_addr)) { - mpu->recvFunctional(read_org); + int num_mirrors_per_atom = vertex_atom / sizeof(MirrorVertex); + int num_vertices_per_atom = vertex_atom / sizeof(WorkListItem); + MirrorVertex mirrors[num_mirrors_per_atom]; + WorkListItem vertices[num_vertices_per_atom]; + + PacketPtr front = readQueue.front(); + front->writeDataToBlock((uint8_t*) mirrors, vertex_atom); + for (int i = 0; i < num_mirrors_per_atom; i++) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); + + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + for (auto mpu: mpuVector) { + AddrRangeList range_list = addrRangeListMap[mpu]; + if (contains(range_list, org_addr)) { + mpu->recvFunctional(read_org); + } + } + read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); + DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, + workload->printWorkListItem(vertices[wl_offset]), front.to_string()); + delete 
read_org; + + if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { + assert(data[wl_offset].degree == 0); + vertices[wl_offset].prop = vertices[wl_offset].tempProp; } + if (mirrors[i].prop != vertices[wl_offset].prop) { + mirrors[i].prop = vertices[wl_offset].prop; + mirrors[i].activeNow = true; + } + } + + PacketPtr wb = createWritePacket( + front->getAddr(), front->getSize(), (uint8_t*) mirrors); + readQueue.pop_front(); + delete front; + + if (!nextWriteBackEvent.scheduled()) { + schedule(nextWriteBackEvent, nextCycle()); } - read_org->writeDataToBlock((uint8_t*) data, vertex_atom); - DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, - workload->printWorkListItem(data[wl_offset]), front.to_string()); - mirrorQueue.pop_front(); - delete read_org; - if (!mirrorQueue.empty()) { + if (!readQueue.empty()) { schedule(nextMirrorUpdateEvent, nextCycle()); } } void -CenteralController::processNextWriteBackEvent() {} +CenteralController::processNextWriteBackEvent() +{ + PacketPtr front = writeBackQueue.front(); +} int CenteralController::workCount() diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index d99f26405f..60746c0c00 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -133,17 +133,19 @@ class CenteralController : public ClockedObject // FIXME: Initialize these two. 
int currentSliceNumber; int totalSliceNumber; + int lastReadPacketId; std::unordered_map startAddrs; std::unordered_map endAddrs; // TODO: Set a max size for this queue; std::deque mirrorPointerQueue; - std::unordered_map> reqInfoMap; - std::deque mirrorQueue; + std::deque readQueue; std::deque writeBackQueue; + int getSliceNumber(Addr vertex_addr); PacketPtr createReadPacket(Addr addr, unsigned int size); + PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); bool handleMemResp(PacketPtr pkt, PortID id); void recvReqRetry(PortID id); From ba126af580d1d61a7e620a48cbbb99c94c3499e5 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 25 Aug 2023 11:47:01 -0700 Subject: [PATCH 274/287] Rebasing --- configs/accl/bfs.py | 22 +- configs/accl/sega.py | 11 +- src/accl/graph/base/graph_workload.hh | 2 + src/accl/graph/sega/CenteralController.py | 9 +- src/accl/graph/sega/centeral_controller.cc | 387 ++++++++++----------- src/accl/graph/sega/centeral_controller.hh | 109 ++---- src/accl/graph/sega/coalesce_engine.cc | 2 +- src/accl/graph/sega/coalesce_engine.hh | 6 +- src/accl/graph/sega/enums.cc | 2 +- src/accl/graph/sega/enums.hh | 9 +- src/accl/graph/sega/mpu.cc | 7 + src/accl/graph/sega/mpu.hh | 5 + src/accl/graph/sega/push_engine.cc | 24 ++ src/accl/graph/sega/push_engine.hh | 3 + src/mem/simple_mem.hh | 3 +- 15 files changed, 281 insertions(+), 320 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 9a280f116b..8221badd9d 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -140,20 +140,24 @@ def get_inputs(): system.create_bfs_workload(init_addr, init_value) if sample: while True: - exit_event = m5.simulate(100000000) + exit_event = m5.simulate(50000000) print( f"Exited simulation at tick {m5.curTick()} " + f"because {exit_event.getCause()}" ) - m5.stats.dump() - m5.stats.reset() - if exit_event.getCause() != "simulate() limit reached": + if exit_event.getCause() == "simulate() limit reached": + m5.stats.dump() + 
m5.stats.reset() + elif exit_event.getCause() == "Done with all the slices.": break else: - exit_event = m5.simulate() - print( - f"Exited simulation at tick {m5.curTick()} " - + f"because {exit_event.getCause()}" - ) + while True: + exit_event = m5.simulate() + print( + f"Exited simulation at tick {m5.curTick()} " + + f"because {exit_event.getCause()}" + ) + if exit_event.getCause() == "Done with all the slices.": + break if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index ca1f4b9381..982235697a 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -64,6 +64,7 @@ def __init__(self, register_file_size: int, cache_size: str): pending_pull_limit=64, active_buffer_size=80, post_push_wb_queue_size=64, + transitions_per_cycle=4, ) self.push_engine = PushEngine( push_req_queue_size=32, @@ -145,7 +146,6 @@ def __init__( graph_path, ): super(SEGA, self).__init__() - # num_gpts should be an even power of 2 assert num_gpts != 0 assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 @@ -160,23 +160,24 @@ def __init__( self.mirror_mem = SimpleMemory( latency="90ns", latency_var="0ns", - bandwidth="28GiB/s", + bandwidth="256GiB/s", image_file=f"{graph_path}/mirrors", range=AddrRange(start=0, size="4GiB"), in_addr_map=False, ) self.map_mem = SimpleMemory( - latency="90ns", + latency="0ns", latency_var="0ns", - bandwidth="28GiB/s", + bandwidth="1024GiB/s", image_file=f"{graph_path}/mirrors_map", range=AddrRange(start=0, size="4GiB"), in_addr_map=False, ) self.ctrl = CenteralController( vertex_image_file=f"{graph_path}/vertices", - mirrors_mem=self.mirror_mem.port, + mem_port=self.mirror_mem.port, mirrors_map_mem=self.map_mem.port, + mirrors_mem=self.mirror_mem ) # Building the EdgeMemories edge_mem = [] diff --git a/src/accl/graph/base/graph_workload.hh b/src/accl/graph/base/graph_workload.hh index 72748502c1..481cfc146f 100644 --- a/src/accl/graph/base/graph_workload.hh +++ b/src/accl/graph/base/graph_workload.hh @@ 
-51,6 +51,7 @@ class GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value) = 0; virtual uint32_t propagate(uint32_t value, uint32_t weight) = 0; virtual uint32_t apply(WorkListItem& wl) = 0; + virtual bool betterThan(uint32_t lhs, uint32_t rhs) { return true; } virtual void iterate() = 0; virtual void interIterationInit(WorkListItem& wl) = 0; virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl) = 0; @@ -74,6 +75,7 @@ class BFSWorkload : public GraphWorkload virtual uint32_t reduce(uint32_t update, uint32_t value); virtual uint32_t propagate(uint32_t value, uint32_t weight); virtual uint32_t apply(WorkListItem& wl); + virtual bool betterThan(uint32_t lhs, uint32_t rhs) override { return lhs < rhs; } virtual void iterate() {} virtual void interIterationInit(WorkListItem& wl) {} virtual bool activeCondition(WorkListItem new_wl, WorkListItem old_wl); diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index a61ca133a1..cff7e8a036 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -28,20 +28,19 @@ from m5.params import * from m5.proxy import * from m5.util.pybind import PyBindMethod -from m5.objects.ClockedObject import ClockedObject +from m5.objects.BaseMemoryEngine import BaseMemoryEngine -class CenteralController(ClockedObject): +class CenteralController(BaseMemoryEngine): type = 'CenteralController' cxx_header = "accl/graph/sega/centeral_controller.hh" cxx_class = 'gem5::CenteralController' - mirrors_mem = RequestPort("Port to a memory storing vertex mirrors file.") mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") - system = Param.System(Parent.any, "System this Engine is a part of") - vertex_image_file = Param.String("Path to the vertex image file.") + mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.") + mpu_vector = VectorParam.MPU("All mpus in the system.") router_vector = 
VectorParam.RouterEngine("All Routers in the system.") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 4885ca83a3..fdbcfe2838 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -28,6 +28,7 @@ #include "accl/graph/sega/centeral_controller.hh" +#include #include #include "base/cprintf.hh" @@ -41,34 +42,32 @@ namespace gem5 { CenteralController::CenteralController(const Params& params): - ClockedObject(params), - system(params.system), - mirrorsPort("mirrors_mem", this, 0), mapPort("map_port", this, 1), - mode(ProcessingMode::NOT_SET), currentSliceNumber(0), totalSliceNumber(148), - lastReadPacketId(0), - nextMirrorMapReadEvent([this] { processNextMirrorMapReadEvent(); }, name()), - nextMirrorReadEvent([this] { processNextMirrorReadEvent(); }, name()), - nextMirrorUpdateEvent([this] { processNextMirrorUpdateEvent(); }, name()), - nextWriteBackEvent([this] { processNextWriteBackEvent(); }, name()) + BaseMemoryEngine(params), + mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), + mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), + nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()) { + uint64_t total_cache_size = 0; for (auto mpu : params.mpu_vector) { mpuVector.push_back(mpu); mpu->registerCenteralController(this); + total_cache_size += mpu->getCacheSize(); } - for (auto router : params.router_vector) { - routerVector.push_back(router); - router->registerCenteralController(this); - } + // for (auto router : params.router_vector) { + // routerVector.push_back(router); + // router->registerCenteralController(this); + // } + verticesPerSlice = std::floor(total_cache_size / sizeof(WorkListItem)); } Port& CenteralController::getPort(const std::string& if_name, PortID idx) { - if (if_name == "mirrors_mem") { - return mirrorsPort; - } else if (if_name == "mirrors_map_mem") { + if (if_name == "mirrors_map_mem") 
{ return mapPort; + } else if (if_name == "mem_port") { + return BaseMemoryEngine::getPort("mem_port", idx); } else { return ClockedObject::getPort(if_name, idx); } @@ -143,7 +142,9 @@ CenteralController::startup() { unsigned int vertex_atom = mpuVector.front()->vertexAtomSize(); for (auto mpu: mpuVector) { - addrRangeListMap[mpu] = mpu->getAddrRanges(); + for (auto range: mpu->getAddrRanges()) { + mpuAddrMap.insert(range, mpu); + } mpu->setProcessingMode(mode); mpu->recvWorkload(workload); } @@ -159,14 +160,20 @@ CenteralController::startup() loader::MemoryImage vertex_image = object->buildImage(); maxVertexAddr = vertex_image.maxAddr(); + int num_total_vertices = (maxVertexAddr / sizeof(WorkListItem)); + numTotalSlices = std::ceil((double) num_total_vertices / verticesPerSlice); + + numPendingUpdates = new int [numTotalSlices]; + bestPendingUpdate = new uint32_t [numTotalSlices]; + for (int i = 0; i < numTotalSlices; i++) { + numPendingUpdates[i] = 0; + bestPendingUpdate[i] = -1; + } + PortProxy vertex_proxy( [this](PacketPtr pkt) { - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, pkt->getAddr())) { - mpu->recvFunctional(pkt); - } - } + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); }, vertex_atom); panic_if(!vertex_image.write(vertex_proxy), "%s: Unable to write image."); @@ -200,40 +207,13 @@ CenteralController::ReqPort::sendPacket(PacketPtr pkt) bool CenteralController::ReqPort::recvTimingResp(PacketPtr pkt) { - return owner->handleMemResp(pkt, _id); + panic("recvTimingResp should not be called at all"); } void CenteralController::ReqPort::recvReqRetry() { - panic_if(blockedPacket == nullptr, - "Received retry without a blockedPacket."); - - DPRINTF(CenteralController, "%s: ReqPort %d received a reqRetry. 
" - "blockedPacket: %s.\n", __func__, _id, blockedPacket->print()); - PacketPtr pkt = blockedPacket; - blockedPacket = nullptr; - sendPacket(pkt); - if (blockedPacket == nullptr) { - DPRINTF(CenteralController, "%s: blockedPacket sent " - "successfully.\n", __func__); - owner->recvReqRetry(_id); - } -} - -PacketPtr -CenteralController::createReadPacket(Addr addr, unsigned int size) -{ - RequestPtr req = std::make_shared(addr, size, 0, 0); - // Dummy PC to have PC-based prefetchers latch on; get entropy into higher - // bits - req->setPC(((Addr) 0) << 2); - - // Embed it in a packet - PacketPtr pkt = new Packet(req, MemCmd::ReadReq); - pkt->allocate(); - - return pkt; + panic("recvReqRetry should not be called at all"); } void @@ -244,9 +224,9 @@ CenteralController::recvDoneSignal() done &= mpu->done(); } - for (auto router : routerVector) { - done &= router->done(); - } + // for (auto router : routerVector) { + // done &= router->done(); + // } if (done && mode == ProcessingMode::ASYNCHRONOUS) { exitSimLoopNow("no update left to process."); @@ -265,179 +245,175 @@ CenteralController::recvDoneSignal() } if (done && mode == ProcessingMode::POLY_GRAPH) { - // assert(!nextMirrorMapReadEvent.scheduled()); - if (!nextMirrorMapReadEvent.scheduled()) { - schedule(nextMirrorMapReadEvent, nextCycle()); + DPRINTF(CenteralController, "%s: Received done signal.\n", __func__); + exitSimLoopNow("Finished processing a slice."); + if (!nextSliceSwitchEvent.scheduled()) { + schedule(nextSliceSwitchEvent, nextCycle()); } } } -void -CenteralController::processNextMirrorMapReadEvent() +int +CenteralController::chooseNextSlice() { - // TODO: In future add functionality to align start_addr and end_addr to - // size of the vertex atom. 
- Addr start_addr = currentSliceNumber * totalSliceNumber * sizeof(int); - Addr end_addr = start_addr + totalSliceNumber * sizeof(int); - PacketPtr start = createReadPacket(start_addr, sizeof(int)); - PointerTag* start_tag = new PointerTag(lastReadPacketId, PointerType::START); - start->pushSenderState(start_tag); - PacketPtr end = createReadPacket(end_addr, sizeof(int)); - PointerTag* end_tag = new PointerTag(lastReadPacketId, PointerType::END); - end->pushSenderState(end_tag); - lastReadPacketId++; - mapPort.sendPacket(start); - mapPort.sendPacket(end); + int ret_slice_id = -1; + int max_pending_count = 0; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_pending_count) { + max_pending_count = numPendingUpdates[i]; + ret_slice_id = i; + } + } + return ret_slice_id; } -bool -CenteralController::handleMemResp(PacketPtr pkt, PortID id) +void +CenteralController::processNextSliceSwitchEvent() { - assert(pkt->isResponse()); - if (id == 0) { - if (pkt->isWrite()) { - delete pkt; - return true; - } - readQueue.push_back(pkt); - delete pkt; - if (!nextMirrorUpdateEvent.scheduled()) { - schedule(nextMirrorUpdateEvent, nextCycle()); - } - return true; - } else if (id == 1) { - PointerTag* tag = pkt->findNextSenderState(); - int read_id = tag->Id(); - PointerType read_type = tag->type(); - if (read_type == PointerType::START) { - assert(startAddrs.find(read_id) == startAddrs.end()); - startAddrs[read_id] = pkt->getLE(); - if (endAddrs.find(read_id) != endAddrs.end()) { - int vertex_atom = mpuVector.front()->vertexAtomSize(); - mirrorPointerQueue.emplace_back( - startAddrs[read_id], endAddrs[read_id], - sizeof(MirrorVertex), vertex_atom); - if (!nextMirrorReadEvent.scheduled()) { - schedule(nextMirrorReadEvent, nextCycle()); - } + int vertex_atom = mpuVector.front()->vertexAtomSize(); + int vertices_per_atom = (int) vertex_atom / sizeof(WorkListItem); + int bytes_accessed = 0; + int updates_generated_total = 0; + for (int dst_id = 0; dst_id < 
numTotalSlices; dst_id++) { + int updates_generated = 0; + Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(int); + Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(int); + PacketPtr start = createReadPacket(start_pointer, sizeof(int)); + PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, + currentSliceId, dst_id, start_addr, end_addr); + + int num_bytes = end_addr - start_addr; + int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex mirrors [num_mirrors]; + + PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + + WorkListItem vertices [vertices_per_atom]; + for (int i = 0; i < num_mirrors; i++) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + Addr aligned_org_addr = roundDown(org_addr, vertex_atom); + int wl_offset = (int) (org_addr - aligned_org_addr) / sizeof(WorkListItem); + PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); + auto routing_entry = mpuAddrMap.contains(aligned_org_addr); + routing_entry->second->recvFunctional(read_org); + read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); + delete read_org; + if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { + assert(vertices[wl_offset].degree == 0); + vertices[wl_offset].prop = vertices[wl_offset].tempProp; } - } else { - assert(read_type == PointerType::END); - assert(endAddrs.find(read_id) == endAddrs.end()); - endAddrs[read_id] = pkt->getLE(); - if (startAddrs.find(read_id) != startAddrs.end()) { - int vertex_atom = mpuVector.front()->vertexAtomSize(); - mirrorPointerQueue.emplace_back( - startAddrs[read_id], 
endAddrs[read_id], - sizeof(MirrorVertex), vertex_atom); - if (!nextMirrorReadEvent.scheduled()) { - schedule(nextMirrorReadEvent, nextCycle()); + if (mirrors[i].prop != vertices[wl_offset].prop) { + mirrors[i].prop = vertices[wl_offset].prop; + if (!mirrors[i].activeNow) { + mirrors[i].activeNow = true; + numPendingUpdates[dst_id]++; + totalUpdatesLeft++; + updates_generated++; } + bestPendingUpdate[dst_id] = + workload->reduce(bestPendingUpdate[dst_id], mirrors[i].prop); } } - DPRINTF(CenteralController, "%s: Received pkt: %s from port %d " - "with value: %d.\n", __func__, - pkt->print(), id, pkt->getLE()); - delete tag; - delete pkt; - return true; + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + DPRINTF(CenteralController, "%s: Done scattering updates from slice " + "%d to slice %d.\n", __func__, currentSliceId, dst_id); + DPRINTF(CenteralController, "%s: Generated %d updates from slice " + "%d to slice %d.\n", __func__, + updates_generated, currentSliceId, dst_id); + updates_generated_total += updates_generated; + bytes_accessed += 2 * num_bytes; + } + DPRINTF(CenteralController, "%s: Done with slice %d.\n", __func__, currentSliceId); + DPRINTF(CenteralController, "%s: Generated a total of %d updates.\n", + __func__, updates_generated_total); + DPRINTF(CenteralController, "%s: There are a total of %d " + "updates left.\n", __func__, totalUpdatesLeft); + if (totalUpdatesLeft > 0) { + currentSliceId = chooseNextSlice(); } else { - panic("did not expect this."); + exitSimLoopNow("Done with all the slices."); + return; } -} - -void -CenteralController::recvReqRetry(PortID id) { - if (id == 0) { - assert(!nextMirrorReadEvent.scheduled()); - if (!mirrorPointerQueue.empty()) { - schedule(nextMirrorReadEvent, nextCycle()); + DPRINTF(CenteralController, "%s: Chose %d as the " + "next slice.\n", __func__, currentSliceId); + + for (int src_id = 0; src_id < 
numTotalSlices; src_id++) { + Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(int); + Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(int); + PacketPtr start = createReadPacket(start_pointer, sizeof(int)); + PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + mapPort.sendFunctional(start); + mapPort.sendFunctional(end); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); + delete start; + delete end; + + int num_bytes = end_addr - start_addr; + int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); + MirrorVertex mirrors [num_mirrors]; + + PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); + memPort.sendFunctional(read_mirrors); + read_mirrors->writeData((uint8_t*) mirrors); + delete read_mirrors; + for (int i = 0; i < num_mirrors; i++) { + if (mirrors[i].activeNow) { + Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); + auto routing_entry = mpuAddrMap.contains(org_addr); + routing_entry->second->recvMirrorPush(org_addr, mirrors[i].prop, + mirrors[i].edgeIndex, mirrors[i].degree); + mirrors[i].activeNow = false; + numPendingUpdates[currentSliceId]--; + totalUpdatesLeft--; + } } - } else if (id == 1) { - DPRINTF(CenteralController, "%s: Ignoring reqRetry " - "for port %d.\n", __func__, id); - } else { - panic("Did not expect the other."); + PacketPtr write_mirrors = + createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); + memPort.sendFunctional(write_mirrors); + delete write_mirrors; + DPRINTF(CenteralController, "%s: Done gathering updates from slice " + "%d to slice %d.\n", __func__, src_id, currentSliceId); + bytes_accessed += num_bytes; } -} -void -CenteralController::processNextMirrorReadEvent() -{ - Addr aligned_addr, offset; - int num_mirrors; - - int vertex_atom = mpuVector.front()->vertexAtomSize(); - MirrorReadInfoGen& front = mirrorPointerQueue.front(); - std::tie(aligned_addr, offset, num_mirrors) = front.nextReadPacketInfo(); 
- PacketPtr pkt = createReadPacket(aligned_addr, vertex_atom); - mirrorsPort.sendPacket(pkt); - front.iterate(); - if (front.done()) { - mirrorPointerQueue.pop_front(); + double mirror_mem_bw = mirrorsMem->getBW(); + Tick time_to_switch = bytes_accessed * mirror_mem_bw; + for (auto mpu: mpuVector) { + mpu->startProcessingMirrors(time_to_switch); } + exitSimLoopNow("Done with slice switch."); +} - if (!mirrorPointerQueue.empty() && !mirrorsPort.blocked()) { - schedule(nextMirrorReadEvent, nextCycle()); - } +bool +CenteralController::handleMemResp(PacketPtr pkt) +{ + panic("handleMemResp should not be called at all"); } void -CenteralController::processNextMirrorUpdateEvent() +CenteralController::recvMemRetry() { - int vertex_atom = mpuVector.front()->vertexAtomSize(); - - int num_mirrors_per_atom = vertex_atom / sizeof(MirrorVertex); - int num_vertices_per_atom = vertex_atom / sizeof(WorkListItem); - MirrorVertex mirrors[num_mirrors_per_atom]; - WorkListItem vertices[num_vertices_per_atom]; - - PacketPtr front = readQueue.front(); - front->writeDataToBlock((uint8_t*) mirrors, vertex_atom); - for (int i = 0; i < num_mirrors_per_atom; i++) { - Addr org_addr = mirrors[i].vertexId * sizeof(WorkListItem); - Addr aligned_org_addr = roundDown(org_addr, vertex_atom); - int wl_offset = (org_addr - aligned_org_addr) / sizeof(WorkListItem); - - PacketPtr read_org = createReadPacket(aligned_org_addr, vertex_atom); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, org_addr)) { - mpu->recvFunctional(read_org); - } - } - read_org->writeDataToBlock((uint8_t*) vertices, vertex_atom); - DPRINTF(CenteralController, "%s: OG: %s, CP: %s.\n", __func__, - workload->printWorkListItem(vertices[wl_offset]), front.to_string()); - delete read_org; - - if (vertices[wl_offset].tempProp != vertices[wl_offset].prop) { - assert(data[wl_offset].degree == 0); - vertices[wl_offset].prop = vertices[wl_offset].tempProp; - } - if (mirrors[i].prop 
!= vertices[wl_offset].prop) { - mirrors[i].prop = vertices[wl_offset].prop; - mirrors[i].activeNow = true; - } - } - - PacketPtr wb = createWritePacket( - front->getAddr(), front->getSize(), (uint8_t*) mirrors); - readQueue.pop_front(); - delete front; - - if (!nextWriteBackEvent.scheduled()) { - schedule(nextWriteBackEvent, nextCycle()); - } - if (!readQueue.empty()) { - schedule(nextMirrorUpdateEvent, nextCycle()); - } + panic("recvMemRetry should not be called at all"); } void -CenteralController::processNextWriteBackEvent() +CenteralController::recvFunctional(PacketPtr pkt) { - PacketPtr front = writeBackQueue.front(); + panic("recvFunctional should not be called at all"); } int @@ -466,12 +442,8 @@ CenteralController::printAnswerToHostSimout() for (Addr addr = 0; addr < maxVertexAddr; addr += vertex_atom) { PacketPtr pkt = createReadPacket(addr, vertex_atom); - for (auto mpu: mpuVector) { - AddrRangeList range_list = addrRangeListMap[mpu]; - if (contains(range_list, addr)) { - mpu->recvFunctional(pkt); - } - } + auto routing_entry = mpuAddrMap.contains(pkt->getAddr()); + routing_entry->second->recvFunctional(pkt); pkt->writeDataToBlock((uint8_t*) items, vertex_atom); for (int i = 0; i < num_items; i++) { std::string print = csprintf("WorkListItem[%lu][%d]: %s.", addr, i, @@ -479,6 +451,7 @@ CenteralController::printAnswerToHostSimout() std::cout << print << std::endl; } + delete pkt; } } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 60746c0c00..b76be8107b 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -29,23 +29,24 @@ #ifndef __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ #define __ACCL_GRAPH_SEGA_CENTERAL_CONTROLLER_HH__ +#include #include #include "accl/graph/base/data_structs.hh" #include "accl/graph/base/graph_workload.hh" +#include "accl/graph/sega/base_memory_engine.hh" #include "accl/graph/sega/enums.hh" #include 
"accl/graph/sega/mpu.hh" #include "accl/graph/sega/router_engine.hh" #include "base/addr_range.hh" #include "base/intmath.hh" +#include "mem/simple_mem.hh" #include "params/CenteralController.hh" -#include "sim/clocked_object.hh" -#include "sim/system.hh" namespace gem5 { -class CenteralController : public ClockedObject +class CenteralController : public BaseMemoryEngine { private: class ReqPort : public RequestPort @@ -68,110 +69,44 @@ class CenteralController : public ClockedObject virtual void recvReqRetry(); }; - struct PointerTag : public Packet::SenderState - { - int _id; - PointerType _type; - PointerTag(int id, PointerType type): _id(id), _type(type) {} - int Id() { return _id; } - PointerType type() { return _type; } - - }; - - class MirrorReadInfoGen { - private: - Addr _start; - Addr _end; - size_t _step; - size_t _atom; - - public: - MirrorReadInfoGen(Addr start, Addr end, size_t step, size_t atom): - _start(start), _end(end), _step(step), _atom(atom) - {} - - std::tuple nextReadPacketInfo() - { - panic_if(done(), "Should not call nextPacketInfo when done.\n"); - Addr aligned_addr = roundDown(_start, _atom); - Addr offset = _start - aligned_addr; - int num_items = 0; - - if (_end > (aligned_addr + _atom)) { - num_items = (_atom - offset) / _step; - } else { - num_items = (_end - _start) / _step; - } - - return std::make_tuple(aligned_addr, offset, num_items); - } - - void iterate() - { - panic_if(done(), "Should not call iterate when done.\n"); - Addr aligned_addr = roundDown(_start, _atom); - _start = aligned_addr + _atom; - } - - bool done() { return (_start >= _end); } - }; - - System* system; - - ReqPort mirrorsPort; ReqPort mapPort; - Addr maxVertexAddr; - ProcessingMode mode; - std::vector mpuVector; - std::vector routerVector; - - std::unordered_map addrRangeListMap; + memory::SimpleMemory* mirrorsMem; - // FIXME: Initialize these two. 
- int currentSliceNumber; - int totalSliceNumber; - - int lastReadPacketId; - std::unordered_map startAddrs; - std::unordered_map endAddrs; - // TODO: Set a max size for this queue; - std::deque mirrorPointerQueue; - - std::deque readQueue; - std::deque writeBackQueue; - - int getSliceNumber(Addr vertex_addr); - PacketPtr createReadPacket(Addr addr, unsigned int size); - PacketPtr createWritePacket(Addr addr, unsigned int size, uint8_t* data); + std::vector mpuVector; + AddrRangeMap mpuAddrMap; - bool handleMemResp(PacketPtr pkt, PortID id); - void recvReqRetry(PortID id); + int currentSliceId; + int numTotalSlices; + int verticesPerSlice; + int totalUpdatesLeft; - EventFunctionWrapper nextMirrorMapReadEvent; - void processNextMirrorMapReadEvent(); + int* numPendingUpdates; + uint32_t* bestPendingUpdate; - EventFunctionWrapper nextMirrorReadEvent; - void processNextMirrorReadEvent(); + int chooseNextSlice(); - EventFunctionWrapper nextMirrorUpdateEvent; - void processNextMirrorUpdateEvent(); + EventFunctionWrapper nextSliceSwitchEvent; + void processNextSliceSwitchEvent(); - EventFunctionWrapper nextWriteBackEvent; - void processNextWriteBackEvent(); + protected: + virtual void recvMemRetry() override; + virtual bool handleMemResp(PacketPtr pkt) override; public: GraphWorkload* workload; PARAMS(CenteralController); - CenteralController(const CenteralControllerParams ¶ms); + CenteralController(const Params& params); Port& getPort(const std::string& if_name, PortID idx = InvalidPortID) override; virtual void startup() override; + virtual void recvFunctional(PacketPtr pkt) override; + void setAsyncMode() { mode = ProcessingMode::ASYNCHRONOUS; } void setBSPMode() { mode = ProcessingMode::BULK_SYNCHRONOUS; } void setPGMode() { mode = ProcessingMode::POLY_GRAPH; } diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 083e8d4c37..9938034a88 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ 
b/src/accl/graph/sega/coalesce_engine.cc @@ -759,7 +759,7 @@ CoalesceEngine::processNextMemoryEvent() { int num_transitions = 0; std::unordered_set transitions; - FunctionDeque temp_deque; + MemoryFunctionDeque temp_deque; temp_deque.clear(); while (true) { diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 4066c7dbe5..9de401cf81 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -43,7 +43,7 @@ namespace gem5 { -typedef std::deque, int, Tick>> FunctionDeque; +typedef std::deque, int, Tick>> MemoryFunctionDeque; class MPU; @@ -144,7 +144,7 @@ class CoalesceEngine : public BaseMemoryEngine int getBlockIndex(Addr addr); int transitionsPerCycle; - FunctionDeque memAccBuffer; + MemoryFunctionDeque memAccBuffer; MemoryEvent nextMemoryEvent; void processNextMemoryEvent(); @@ -217,7 +217,7 @@ class CoalesceEngine : public BaseMemoryEngine void createBSPPopCountDirectory(int atoms_per_block); void recvWorkload(GraphWorkload* workload) { graphWorkload = workload; } - virtual void recvFunctional(PacketPtr pkt); + virtual void recvFunctional(PacketPtr pkt) override; void postMemInitSetup(); void postConsumeProcess(); void swapDirectories(); diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index 5b8de3404f..c85c60fd8d 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -66,6 +66,6 @@ const char* processingModeStrings[NUM_PROCESSING_MODE] = "BULK_SYNCHRONOUS" }; -const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"N/A", "START", "END"}; +const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"START", "END"}; } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 92e293bec0..194fdc2140 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -79,9 +79,16 @@ enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum PointerType +enum 
PGMode { NA, + SCATTER, + GATHER, + NUM_PG_MODE +}; + +enum PointerType +{ START, END, NUM_POINTER_TYPE diff --git a/src/accl/graph/sega/mpu.cc b/src/accl/graph/sega/mpu.cc index f661bd68a6..a5063cf685 100644 --- a/src/accl/graph/sega/mpu.cc +++ b/src/accl/graph/sega/mpu.cc @@ -87,6 +87,13 @@ MPU::recvVertexPush(Addr addr, uint32_t delta, pushEngine->recvVertexPush(addr, delta, edge_index, degree); } +void +MPU::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree) +{ + pushEngine->recvMirrorPush(addr, delta, edge_index, degree); +} + void MPU::recvDoneSignal() { diff --git a/src/accl/graph/sega/mpu.hh b/src/accl/graph/sega/mpu.hh index 95d3adeca5..4afb2081ca 100644 --- a/src/accl/graph/sega/mpu.hh +++ b/src/accl/graph/sega/mpu.hh @@ -69,6 +69,7 @@ class MPU : public SimObject unsigned int vertexAtomSize() { return coalesceEngine->params().attached_memory_atom_size; } AddrRangeList getAddrRanges() { return coalesceEngine->getAddrRanges(); } + uint64_t getCacheSize() { return coalesceEngine->params().cache_size; } void recvFunctional(PacketPtr pkt) { coalesceEngine->recvFunctional(pkt); } void postMemInitSetup() { coalesceEngine->postMemInitSetup(); } void postConsumeProcess() { coalesceEngine->postConsumeProcess(); } @@ -88,6 +89,10 @@ class MPU : public SimObject void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait) { pushEngine->startProcessingMirrors(time_to_wait); } + void recvDoneSignal(); bool done(); }; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 893643c510..70470a512d 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -218,6 +218,30 @@ PushEngine::recvVertexPush(Addr addr, uint32_t delta, } } +void +PushEngine::recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, 
uint32_t degree) +{ + Addr start_addr = edge_index * sizeof(Edge); + Addr end_addr = start_addr + (degree * sizeof(Edge)); + EdgeReadInfoGen info_gen(addr, delta, start_addr, end_addr, + sizeof(Edge), peerMemoryAtomSize); + + edgePointerQueue.emplace_back(info_gen, curTick()); + stats.edgePointerQueueLength.sample(edgePointerQueue.size()); +} + +void +PushEngine::startProcessingMirrors(Tick time_to_wait) +{ + assert(!nextMemoryReadEvent.pending()); + assert(!nextMemoryReadEvent.scheduled()); + Cycles wait = ticksToCycles(time_to_wait); + if (!edgePointerQueue.empty()) { + schedule(nextMemoryReadEvent, clockEdge(wait)); + } +} + void PushEngine::processNextMemoryReadEvent() { diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 08a5d278f5..2aced4b156 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -209,6 +209,9 @@ class PushEngine : public BaseMemoryEngine bool running() { return _running; } void recvVertexPush(Addr addr, uint32_t delta, uint32_t edge_index, uint32_t degree); + void recvMirrorPush(Addr addr, uint32_t delta, + uint32_t edge_index, uint32_t degree); + void startProcessingMirrors(Tick time_to_wait); void recvReqRetry(); diff --git a/src/mem/simple_mem.hh b/src/mem/simple_mem.hh index fc6d6849d5..f57ef33629 100644 --- a/src/mem/simple_mem.hh +++ b/src/mem/simple_mem.hh @@ -178,7 +178,6 @@ class SimpleMemory : public AbstractMemory std::unique_ptr pendingDelete; public: - SimpleMemory(const SimpleMemoryParams &p); DrainState drain() override; @@ -187,6 +186,8 @@ class SimpleMemory : public AbstractMemory PortID idx=InvalidPortID) override; void init() override; + double getBW() { return bandwidth; } + protected: Tick recvAtomic(PacketPtr pkt); Tick recvAtomicBackdoor(PacketPtr pkt, MemBackdoorPtr &_backdoor); From 2742130eb5163e64fa14dd668db5c2368acd27b3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Apr 2023 01:56:31 -0700 Subject: [PATCH 275/287] Cleaning up 
and merging temp partition. --- configs/accl/sega.py | 107 ++++++++++++-------- configs/accl/sega_simple.py | 109 ++++++++++++++++----- src/accl/graph/sega/CenteralController.py | 3 + src/accl/graph/sega/PushEngine.py | 3 + src/accl/graph/sega/WLEngine.py | 6 +- src/accl/graph/sega/centeral_controller.cc | 20 +++- src/accl/graph/sega/centeral_controller.hh | 2 +- src/accl/graph/sega/enums.cc | 10 -- src/accl/graph/sega/enums.hh | 16 --- src/accl/graph/sega/push_engine.cc | 3 +- src/accl/graph/sega/push_engine.hh | 1 + src/accl/graph/sega/wl_engine.cc | 4 +- src/accl/graph/sega/wl_engine.hh | 3 +- 13 files changed, 183 insertions(+), 104 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 982235697a..e73a6d1843 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -52,10 +52,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.wl_engine = WLEngine( update_queue_size=64, register_file_size=register_file_size, + examine_window=8, rd_per_cycle=4, reduce_per_cycle=32, wr_per_cycle=4, - num_updates_processed=8, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -69,9 +69,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=4096, + resp_queue_size=1024, + examine_window=12, max_propagates_per_cycle=8, - update_queue_size=32, + update_queue_size=64, ) self.vertex_mem_ctrl = HBMCtrl( @@ -137,6 +138,43 @@ def setPort(self, port): self.xbar.cpu_side_ports = port +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, 
size="4GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controlller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + class SEGA(System): def __init__( self, @@ -156,30 +194,9 @@ def __init__( self.cache_line_size = 32 self.mem_mode = "timing" - # Building the CenteralController - self.mirror_mem = SimpleMemory( - latency="90ns", - latency_var="0ns", - bandwidth="256GiB/s", - image_file=f"{graph_path}/mirrors", - range=AddrRange(start=0, size="4GiB"), - in_addr_map=False, - ) - self.map_mem = SimpleMemory( - latency="0ns", - latency_var="0ns", - bandwidth="1024GiB/s", - image_file=f"{graph_path}/mirrors_map", - range=AddrRange(start=0, size="4GiB"), - in_addr_map=False, - ) - self.ctrl = CenteralController( - vertex_image_file=f"{graph_path}/vertices", - mem_port=self.mirror_mem.port, - mirrors_map_mem=self.map_mem.port, - mirrors_mem=self.mirror_mem - ) - # Building the EdgeMemories + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + edge_mem = [] for i in range(int(num_gpts / 2)): mem = EdgeMemory("4GiB") @@ -207,46 +224,52 @@ def __init__( gpt_0.setReqPort(gpt_1.getRespPort()) self.gpts = gpts - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) def work_count(self): - return self.ctrl.workCount() + return self.ctrl.controller.workCount() def set_async_mode(self): - self.ctrl.setAsyncMode() + self.ctrl.controller.setAsyncMode() def set_bsp_mode(self): - self.ctrl.setBSPMode() + 
self.ctrl.controller.setBSPMode() def set_pg_mode(self): - self.ctrl.setPGMode() + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) + self.ctrl.controller.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) + self.ctrl.controller.createBFSWorkload(init_addr, init_value) def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) def create_cc_workload(self): - self.ctrl.createCCWorkload() + self.ctrl.controller.createCCWorkload() def create_async_pr_workload(self, alpha, threshold): - self.ctrl.createAsyncPRWorkload(alpha, threshold) + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) def create_pr_workload(self, num_nodes, alpha): - self.ctrl.createPRWorkload(num_nodes, alpha) + self.ctrl.controller.createPRWorkload(num_nodes, alpha) def get_pr_error(self): - return self.ctrl.getPRError() + return self.ctrl.controller.getPRError() def create_bc_workload(self, init_addr, init_value): - self.ctrl.createBCWorkload(init_addr, init_value) + self.ctrl.controller.createBCWorkload(init_addr, init_value) def print_answer(self): - self.ctrl.printAnswerToHostSimout() + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index b389d7e3e7..e1b0aa1fab 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -52,9 
+52,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.wl_engine = WLEngine( update_queue_size=64, register_file_size=register_file_size, - rd_per_cycle=2, + examine_window=8, + rd_per_cycle=4, reduce_per_cycle=32, - wr_per_cycle=2, + wr_per_cycle=4, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -63,17 +64,19 @@ def __init__(self, register_file_size: int, cache_size: str): pending_pull_limit=64, active_buffer_size=80, post_push_wb_queue_size=64, + transitions_per_cycle=4, ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=4096, + resp_queue_size=1024, + examine_window=12, max_propagates_per_cycle=8, - update_queue_size=32, + update_queue_size=64, ) self.vertex_mem_ctrl = SimpleMemory( - latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + latency="120ns", bandwidth="28GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port @@ -130,11 +133,57 @@ def setPort(self, port): self.xbar.cpu_side_ports = port +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ), + ) + self.controlller.mem_port = self.controlller.mirrors_mem.port + self.controlller.mirrors_map_mem = self.map_mem.port + + def set_choose_best(self, choose_best): + self.controlller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controlller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controlller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + 
self.controlller.mpu_vector = mpu_vector + + class SEGA(System): +<<<<<<< HEAD def __init__(self, num_gpts, num_registers, cache_size, r_queue_size, r_latency, graph_path): +======= + def __init__( + self, + num_gpts, + num_registers, + cache_size, + graph_path, + ): +>>>>>>> Cleaning up and merging temp partition. super(SEGA, self).__init__() - # num_gpts should be an even power of 2 assert num_gpts != 0 assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 @@ -145,11 +194,9 @@ def __init__(self, num_gpts, num_registers, cache_size, self.cache_line_size = 32 self.mem_mode = "timing" - # Building the CenteralController - self.ctrl = CenteralController( - vertex_image_file=f"{graph_path}/vertices" - ) - # Building the EdgeMemories + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") + edge_mem = [] for i in range(int(num_gpts / 2)): mem = EdgeMemory("4GiB") @@ -163,7 +210,9 @@ def __init__(self, num_gpts, num_registers, cache_size, gpts = [] for i in range(num_gpts): gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range(vertex_ranges[i]) + gpt.set_vertex_range( + [vertex_ranges[i], vertex_ranges[i + num_gpts]] + ) gpt.setEdgeMemPort( self.edge_mem[i % (int(num_gpts / 2))].getPort() ) @@ -178,37 +227,49 @@ def __init__(self, num_gpts, num_registers, cache_size, self.ctrl.router_vector = [] def work_count(self): - return self.ctrl.workCount() + return self.ctrl.controller.workCount() def set_async_mode(self): - self.ctrl.setAsyncMode() + self.ctrl.controller.setAsyncMode() def set_bsp_mode(self): - self.ctrl.setBSPMode() + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) + 
self.ctrl.controller.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): - self.ctrl.createBFSWorkload(init_addr, init_value) + self.ctrl.controller.createBFSWorkload(init_addr, init_value) def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) def create_cc_workload(self): - self.ctrl.createCCWorkload() + self.ctrl.controller.createCCWorkload() def create_async_pr_workload(self, alpha, threshold): - self.ctrl.createAsyncPRWorkload(alpha, threshold) + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) def create_pr_workload(self, num_nodes, alpha): - self.ctrl.createPRWorkload(num_nodes, alpha) + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() def create_bc_workload(self, init_addr, init_value): - self.ctrl.createBCWorkload(init_addr, init_value) + self.ctrl.controller.createBCWorkload(init_addr, init_value) def print_answer(self): - self.ctrl.printAnswerToHostSimout() + self.ctrl.controller.printAnswerToHostSimout() diff --git a/src/accl/graph/sega/CenteralController.py b/src/accl/graph/sega/CenteralController.py index cff7e8a036..619e76f1ee 100644 --- a/src/accl/graph/sega/CenteralController.py +++ b/src/accl/graph/sega/CenteralController.py @@ -37,6 +37,9 @@ class CenteralController(BaseMemoryEngine): mirrors_map_mem = RequestPort("Port to a memory storing mirrors map file.") + choose_best = Param.Bool("Whether to prefer the best update " + "value for choosing the next slice") + vertex_image_file = Param.String("Path to the vertex image file.") mirrors_mem = Param.SimpleMemory("Memory to store the vertex mirrors.") diff --git 
a/src/accl/graph/sega/PushEngine.py b/src/accl/graph/sega/PushEngine.py index 63fa1eae62..2174f943f4 100644 --- a/src/accl/graph/sega/PushEngine.py +++ b/src/accl/graph/sega/PushEngine.py @@ -42,6 +42,9 @@ class PushEngine(BaseMemoryEngine): "push engine where it stores the " "edges read from memory.") + examine_window = Param.Int("Number of edges at the front of the edge queue" + " to examine in order to propagate.") + max_propagates_per_cycle = Param.Int("Maximum number of propagates " "done per cycle.") diff --git a/src/accl/graph/sega/WLEngine.py b/src/accl/graph/sega/WLEngine.py index cfec70081d..f9ea4488df 100644 --- a/src/accl/graph/sega/WLEngine.py +++ b/src/accl/graph/sega/WLEngine.py @@ -45,8 +45,10 @@ class WLEngine(BaseReduceEngine): "many updates as this queueu has " "entries at the same time.") + examine_window = Param.Int("Number of updates at the front of update " + "queue examined for reading.") rd_per_cycle = Param.Int("Maximum number of reads per cycle.") reduce_per_cycle = Param.Int("Maximum number of reduce per cycle.") wr_per_cycle = Param.Int("Maximum number of writes per cycle.") - - num_updates_processed = Param.Int("Maximum number of updates processed") + + diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index fdbcfe2838..0b36b5e067 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -45,6 +45,7 @@ CenteralController::CenteralController(const Params& params): BaseMemoryEngine(params), mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), + chooseBest(params.choose_best), nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()) { uint64_t total_cache_size = 0; @@ -256,15 +257,26 @@ CenteralController::recvDoneSignal() int CenteralController::chooseNextSlice() { - int ret_slice_id = -1; + int crowded_slice_id = -1; int max_pending_count = 0; 
+ // TODO: Make this general for all workloads + uint32_t best_update = -1; + int best_slice_id = -1; for (int i = 0; i < numTotalSlices; i++) { if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; - ret_slice_id = i; + crowded_slice_id = i; } + if (workload->betterThan(bestPendingUpdate[i], best_update)) { + best_update = bestPendingUpdate[i]; + best_slice_id = i; + } + } + if (chooseBest) { + return best_slice_id; + } else { + return crowded_slice_id; } - return ret_slice_id; } void @@ -321,7 +333,7 @@ CenteralController::processNextSliceSwitchEvent() updates_generated++; } bestPendingUpdate[dst_id] = - workload->reduce(bestPendingUpdate[dst_id], mirrors[i].prop); + workload->betterThan(mirrors[i].prop, bestPendingUpdate[dst_id]); } } PacketPtr write_mirrors = diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index b76be8107b..52d6f5d966 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -83,9 +83,9 @@ class CenteralController : public BaseMemoryEngine int verticesPerSlice; int totalUpdatesLeft; + bool chooseBest; int* numPendingUpdates; uint32_t* bestPendingUpdate; - int chooseNextSlice(); EventFunctionWrapper nextSliceSwitchEvent; diff --git a/src/accl/graph/sega/enums.cc b/src/accl/graph/sega/enums.cc index c85c60fd8d..ba57b387f4 100644 --- a/src/accl/graph/sega/enums.cc +++ b/src/accl/graph/sega/enums.cc @@ -58,14 +58,4 @@ const char* readDestinationStrings[NUM_READ_DESTINATION] = "READ_FOR_PUSH" }; -const char* processingModeStrings[NUM_PROCESSING_MODE] = -{ - "NOT_SET", - "ASYNCHRONOUS", - "POLY_GRAPH", - "BULK_SYNCHRONOUS" -}; - -const char* pointerTypeStrings[NUM_POINTER_TYPE] = {"START", "END"}; - } // namespace gem5 diff --git a/src/accl/graph/sega/enums.hh b/src/accl/graph/sega/enums.hh index 194fdc2140..0f654c5386 100644 --- a/src/accl/graph/sega/enums.hh +++ b/src/accl/graph/sega/enums.hh @@ -79,22 +79,6 @@ 
enum ProcessingMode }; extern const char* processingModeStrings[NUM_PROCESSING_MODE]; -enum PGMode -{ - NA, - SCATTER, - GATHER, - NUM_PG_MODE -}; - -enum PointerType -{ - START, - END, - NUM_POINTER_TYPE -}; -extern const char* pointerTypeStrings[NUM_POINTER_TYPE]; - } // namespace gem5 #endif // __ACCL_GRAPH_SEGA_ENUMS_HH__ diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index 70470a512d..b6fdf54f13 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -43,6 +43,7 @@ PushEngine::PushEngine(const Params& params): lastIdleEntranceTick(0), numPendingPulls(0), edgePointerQueueSize(params.push_req_queue_size), onTheFlyMemReqs(0), edgeQueueSize(params.resp_queue_size), + examineWindow(params.examine_window), maxPropagatesPerCycle(params.max_propagates_per_cycle), updateQueueSize(params.update_queue_size), nextVertexPullEvent([this] { processNextVertexPullEvent(); }, name()), @@ -341,7 +342,7 @@ PushEngine::processNextPropagateEvent() int num_tries = 0; int num_reads = 0; std::deque> temp_edge; - for (int i = 0; i < maxPropagatesPerCycle; i++) { + for (int i = 0; i < examineWindow; i++) { if (metaEdgeQueue.empty()) { break; } diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 2aced4b156..0108a2d7ef 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -136,6 +136,7 @@ class PushEngine : public BaseMemoryEngine int onTheFlyMemReqs; int edgeQueueSize; + int examineWindow; int maxPropagatesPerCycle; std::deque> metaEdgeQueue; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index c294441703..0b64e09d67 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -44,10 +44,10 @@ namespace gem5 WLEngine::WLEngine(const WLEngineParams& params): BaseReduceEngine(params), updateQueueSize(params.update_queue_size), + examineWindow(params.examine_window), 
maxReadsPerCycle(params.rd_per_cycle), maxReducesPerCycle(params.reduce_per_cycle), maxWritesPerCycle(params.wr_per_cycle), - maxUpdatesProcessed(params.num_updates_processed), registerFileSize(params.register_file_size), nextReadEvent([this]{ processNextReadEvent(); }, name()), nextReduceEvent([this]{ processNextReduceEvent(); }, name()), @@ -211,7 +211,7 @@ void WLEngine::processNextReadEvent() { std::deque> temp_queue; - for (int i = 0; i < maxUpdatesProcessed; i++) { + for (int i = 0; i < examineWindow; i++) { if (updateQueue.empty()) { break; } diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index bb8e82f501..2c08e4e273 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ b/src/accl/graph/sega/wl_engine.hh @@ -80,12 +80,11 @@ class WLEngine : public BaseReduceEngine std::deque> updateQueue; std::unordered_map valueMap; + int examineWindow; int maxReadsPerCycle; int maxReducesPerCycle; int maxWritesPerCycle; - int maxUpdatesProcessed; - int registerFileSize; std::unordered_map> registerFile; std::unordered_map workListFile; From 811189d7c751694f5d8af160a89d1241f9f1db13 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Wed, 12 Apr 2023 08:24:59 -0700 Subject: [PATCH 276/287] Updating config scripts. 
--- configs/accl/bfs.py | 31 +++++++++++++++++++++++++++++-- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 30 +++++++++--------------------- 3 files changed, 39 insertions(+), 24 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 8221badd9d..1a58a5fc41 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -41,6 +41,22 @@ def get_inputs(): argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument( + "--tile", + dest="tile", + action="store_const", + const=True, + default=False, + help="Whether to use temporal partitioning", + ) + argparser.add_argument( + "--best", + dest="best", + action="store_const", + const=True, + default=False, + help="Whether to use best update value for switching slices", + ) argparser.add_argument( "--visited", dest="visited", @@ -93,6 +109,8 @@ def get_inputs(): args.graph, args.init_addr, args.init_value, + args.tile, + args.best, args.visited, args.simple, args.pt2pt, @@ -111,6 +129,8 @@ def get_inputs(): graph, init_addr, init_value, + tile, + best, visited, simple, pt2pt, @@ -131,8 +151,11 @@ def get_inputs(): m5.instantiate() - # system.set_async_mode() - system.set_pg_mode() + if tile: + system.set_pg_mode() + else: + system.set_async_mode() + system.create_pop_count_directory(64) if visited: system.create_bfs_visited_workload(init_addr, init_value) @@ -150,6 +173,8 @@ def get_inputs(): m5.stats.reset() elif exit_event.getCause() == "Done with all the slices.": break + elif exit_event.getCause() == "no update left to process.": + break else: while True: exit_event = m5.simulate() @@ -159,5 +184,7 @@ def get_inputs(): ) if exit_event.getCause() == "Done with all the slices.": break + if exit_event.getCause() == "no update left to process.": + break if verify: system.print_answer() diff --git a/configs/accl/sega.py b/configs/accl/sega.py index e73a6d1843..2df36fab20 100644 --- 
a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -162,7 +162,7 @@ def __init__(self, mirror_bw): self.controller.mirrors_map_mem = self.map_mem.port def set_choose_best(self, choose_best): - self.controlller.choose_best = choose_best + self.controller.choose_best = choose_best def set_vertices_image(self, vertices): self.controller.vertex_image_file = vertices diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index e1b0aa1fab..bdecd24c2f 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -76,7 +76,7 @@ def __init__(self, register_file_size: int, cache_size: str): ) self.vertex_mem_ctrl = SimpleMemory( - latency="120ns", bandwidth="28GiB/s" + latency="120ns", bandwidth="256GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port @@ -153,36 +153,26 @@ def __init__(self, mirror_bw): in_addr_map=False, ), ) - self.controlller.mem_port = self.controlller.mirrors_mem.port - self.controlller.mirrors_map_mem = self.map_mem.port + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port def set_choose_best(self, choose_best): - self.controlller.choose_best = choose_best + self.controller.choose_best = choose_best def set_vertices_image(self, vertices): - self.controlller.vertex_image_file = vertices + self.controller.vertex_image_file = vertices def set_aux_images(self, mirrors, mirrors_map): - self.controlller.mirrors_mem.image_file = mirrors + self.controller.mirrors_mem.image_file = mirrors self.map_mem.image_file = mirrors_map def set_mpu_vector(self, mpu_vector): - self.controlller.mpu_vector = mpu_vector + self.controller.mpu_vector = mpu_vector class SEGA(System): -<<<<<<< HEAD def __init__(self, num_gpts, num_registers, cache_size, r_queue_size, r_latency, graph_path): -======= - def __init__( - self, - num_gpts, - num_registers, - cache_size, - graph_path, - ): ->>>>>>> Cleaning up and merging temp partition. 
super(SEGA, self).__init__() assert num_gpts != 0 assert num_gpts % 2 == 0 @@ -210,9 +200,7 @@ def __init__( gpts = [] for i in range(num_gpts): gpt = GPT(num_registers, cache_size) - gpt.set_vertex_range( - [vertex_ranges[i], vertex_ranges[i + num_gpts]] - ) + gpt.set_vertex_range(vertex_ranges[i]) gpt.setEdgeMemPort( self.edge_mem[i % (int(num_gpts / 2))].getPort() ) @@ -239,7 +227,7 @@ def set_pg_mode(self): self.ctrl.controller.setPGMode() def set_aux_images(self, mirrors, mirrors_map): - self.ctrl.set_images(mirrors, mirrors_map) + self.ctrl.set_aux_images(mirrors, mirrors_map) def set_choose_best(self, choose_best): self.ctrl.set_choose_best(choose_best) From e417443dcbcd40815eb73f08933a563044d8beb3 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 13 Apr 2023 21:14:22 -0700 Subject: [PATCH 277/287] Adding stats to centeral controller. --- src/accl/graph/sega/centeral_controller.cc | 44 ++++++++++++++++++++-- src/accl/graph/sega/centeral_controller.hh | 15 ++++++++ src/accl/graph/sega/coalesce_engine.cc | 6 +-- src/accl/graph/sega/coalesce_engine.hh | 2 +- src/accl/graph/sega/push_engine.cc | 5 +-- src/accl/graph/sega/push_engine.hh | 2 +- src/accl/graph/sega/wl_engine.cc | 5 +-- src/accl/graph/sega/wl_engine.hh | 2 +- 8 files changed, 64 insertions(+), 17 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 0b36b5e067..488281fe84 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -46,7 +46,8 @@ CenteralController::CenteralController(const Params& params): mapPort("map_port", this, 1), mode(ProcessingMode::NOT_SET), mirrorsMem(params.mirrors_mem), currentSliceId(0), totalUpdatesLeft(0), chooseBest(params.choose_best), - nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()) + nextSliceSwitchEvent([this] { processNextSliceSwitchEvent(); }, name()), + stats(*this) { uint64_t total_cache_size = 0; for (auto mpu : 
params.mpu_vector) { @@ -262,13 +263,16 @@ CenteralController::chooseNextSlice() // TODO: Make this general for all workloads uint32_t best_update = -1; int best_slice_id = -1; + int max_best_pending_count = 0; for (int i = 0; i < numTotalSlices; i++) { if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; crowded_slice_id = i; } - if (workload->betterThan(bestPendingUpdate[i], best_update)) { + if (numPendingUpdates[i] > max_best_pending_count && + workload->betterThan(bestPendingUpdate[i], best_update)) { best_update = bestPendingUpdate[i]; + max_best_pending_count = numPendingUpdates[i]; best_slice_id = i; } } @@ -287,6 +291,9 @@ CenteralController::processNextSliceSwitchEvent() int bytes_accessed = 0; int updates_generated_total = 0; for (int dst_id = 0; dst_id < numTotalSlices; dst_id++) { + if (dst_id == currentSliceId) { + continue; + } int updates_generated = 0; Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(int); Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(int); @@ -303,7 +310,7 @@ CenteralController::processNextSliceSwitchEvent() int num_bytes = end_addr - start_addr; int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); - MirrorVertex mirrors [num_mirrors]; + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); memPort.sendFunctional(read_mirrors); @@ -340,6 +347,7 @@ CenteralController::processNextSliceSwitchEvent() createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); memPort.sendFunctional(write_mirrors); delete write_mirrors; + delete [] mirrors; DPRINTF(CenteralController, "%s: Done scattering updates from slice " "%d to slice %d.\n", __func__, currentSliceId, dst_id); DPRINTF(CenteralController, "%s: Generated %d updates from slice " @@ -363,6 +371,9 @@ CenteralController::processNextSliceSwitchEvent() "next slice.\n", __func__, currentSliceId); for (int src_id = 
0; src_id < numTotalSlices; src_id++) { + if (src_id == currentSliceId) { + continue; + } Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(int); Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(int); PacketPtr start = createReadPacket(start_pointer, sizeof(int)); @@ -376,7 +387,7 @@ CenteralController::processNextSliceSwitchEvent() int num_bytes = end_addr - start_addr; int num_mirrors = (int) (end_addr - start_addr) / sizeof(MirrorVertex); - MirrorVertex mirrors [num_mirrors]; + MirrorVertex* mirrors = new MirrorVertex [num_mirrors]; PacketPtr read_mirrors = createReadPacket(start_addr, num_bytes); memPort.sendFunctional(read_mirrors); @@ -397,6 +408,7 @@ CenteralController::processNextSliceSwitchEvent() createWritePacket(start_addr, num_bytes, (uint8_t*) mirrors); memPort.sendFunctional(write_mirrors); delete write_mirrors; + delete [] mirrors; DPRINTF(CenteralController, "%s: Done gathering updates from slice " "%d to slice %d.\n", __func__, src_id, currentSliceId); bytes_accessed += num_bytes; @@ -404,6 +416,9 @@ CenteralController::processNextSliceSwitchEvent() double mirror_mem_bw = mirrorsMem->getBW(); Tick time_to_switch = bytes_accessed * mirror_mem_bw; + stats.switchTicks += time_to_switch; + stats.switchedBytes += bytes_accessed; + stats.numSwitches++; for (auto mpu: mpuVector) { mpu->startProcessingMirrors(time_to_switch); } @@ -467,4 +482,25 @@ CenteralController::printAnswerToHostSimout() } } +CenteralController::ControllerStats::ControllerStats(CenteralController& _ctrl): + statistics::Group(&_ctrl), ctrl(_ctrl), + ADD_STAT(numSwitches, statistics::units::Byte::get(), + "Number of slices switches completed."), + ADD_STAT(switchedBytes, statistics::units::Byte::get(), + "Number of bytes accessed during slice switching."), + ADD_STAT(switchTicks, statistics::units::Tick::get(), + "Number of ticks spent switching slices."), + ADD_STAT(switchSeconds, statistics::units::Second::get(), + "Traversed Edges 
Per Second.") +{ +} + +void +CenteralController::ControllerStats::regStats() +{ + using namespace statistics; + + switchSeconds = switchTicks / simFreq; +} + } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 52d6f5d966..883437a202 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -91,6 +91,21 @@ class CenteralController : public BaseMemoryEngine EventFunctionWrapper nextSliceSwitchEvent; void processNextSliceSwitchEvent(); + struct ControllerStats : public statistics::Group + { + ControllerStats(CenteralController& ctrl); + + void regStats() override; + + CenteralController& ctrl; + + statistics::Scalar numSwitches; + statistics::Scalar switchedBytes; + statistics::Scalar switchTicks; + statistics::Formula switchSeconds; + }; + ControllerStats stats; + protected: virtual void recvMemRetry() override; virtual bool handleMemResp(PacketPtr pkt) override; diff --git a/src/accl/graph/sega/coalesce_engine.cc b/src/accl/graph/sega/coalesce_engine.cc index 9938034a88..5e0c8c8095 100644 --- a/src/accl/graph/sega/coalesce_engine.cc +++ b/src/accl/graph/sega/coalesce_engine.cc @@ -1237,10 +1237,8 @@ CoalesceEngine::processNextDoneSignalEvent() } } -CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine &_coalesce) - : statistics::Group(&_coalesce), - coalesce(_coalesce), - lastResetTick(0), +CoalesceEngine::CoalesceStats::CoalesceStats(CoalesceEngine& _coalesce): + statistics::Group(&_coalesce), coalesce(_coalesce), lastResetTick(0), ADD_STAT(numVertexReads, statistics::units::Count::get(), "Number of memory vertecies read from cache."), ADD_STAT(numVertexWrites, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/coalesce_engine.hh b/src/accl/graph/sega/coalesce_engine.hh index 9de401cf81..3a9e463595 100644 --- a/src/accl/graph/sega/coalesce_engine.hh +++ b/src/accl/graph/sega/coalesce_engine.hh @@ -164,7 +164,7 @@ class CoalesceEngine : 
public BaseMemoryEngine struct CoalesceStats : public statistics::Group { - CoalesceStats(CoalesceEngine &coalesce); + CoalesceStats(CoalesceEngine& coalesce); virtual void regStats() override; diff --git a/src/accl/graph/sega/push_engine.cc b/src/accl/graph/sega/push_engine.cc index b6fdf54f13..6040989070 100644 --- a/src/accl/graph/sega/push_engine.cc +++ b/src/accl/graph/sega/push_engine.cc @@ -515,9 +515,8 @@ PushEngine::processNextUpdatePushEvent() } } -PushEngine::PushStats::PushStats(PushEngine &_push) - : statistics::Group(&_push), - push(_push), +PushEngine::PushStats::PushStats(PushEngine& _push): + statistics::Group(&_push), push(_push), ADD_STAT(numPropagates, statistics::units::Count::get(), "Number of propagate operations done."), ADD_STAT(updateQueueFull, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/push_engine.hh b/src/accl/graph/sega/push_engine.hh index 0108a2d7ef..7170d2d22e 100644 --- a/src/accl/graph/sega/push_engine.hh +++ b/src/accl/graph/sega/push_engine.hh @@ -165,7 +165,7 @@ class PushEngine : public BaseMemoryEngine struct PushStats : public statistics::Group { - PushStats(PushEngine &push); + PushStats(PushEngine& push); void regStats() override; diff --git a/src/accl/graph/sega/wl_engine.cc b/src/accl/graph/sega/wl_engine.cc index 0b64e09d67..8e5ccc9ebe 100644 --- a/src/accl/graph/sega/wl_engine.cc +++ b/src/accl/graph/sega/wl_engine.cc @@ -457,9 +457,8 @@ WLEngine::processNextDoneSignalEvent() } } -WLEngine::WorkListStats::WorkListStats(WLEngine &_wl) - : statistics::Group(&_wl), - wl(_wl), +WLEngine::WorkListStats::WorkListStats(WLEngine& _wl): + statistics::Group(&_wl), wl(_wl), ADD_STAT(updateQueueCoalescions, statistics::units::Count::get(), "Number of coalescions in the update queues."), ADD_STAT(registerShortage, statistics::units::Count::get(), diff --git a/src/accl/graph/sega/wl_engine.hh b/src/accl/graph/sega/wl_engine.hh index 2c08e4e273..ad67f19cb5 100644 --- a/src/accl/graph/sega/wl_engine.hh +++ 
b/src/accl/graph/sega/wl_engine.hh @@ -107,7 +107,7 @@ class WLEngine : public BaseReduceEngine struct WorkListStats : public statistics::Group { - WorkListStats(WLEngine &worklist); + WorkListStats(WLEngine& worklist); void regStats() override; From a1c592d74c00b04b899db7ac89ea8d553ff76406 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 13 Apr 2023 21:35:07 -0700 Subject: [PATCH 278/287] Updating choosing next slice. --- src/accl/graph/sega/centeral_controller.cc | 31 ++++++++++++++-------- 1 file changed, 20 insertions(+), 11 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 488281fe84..add2296073 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -258,29 +258,38 @@ CenteralController::recvDoneSignal() int CenteralController::chooseNextSlice() { - int crowded_slice_id = -1; + int ret_slice_id = -1; int max_pending_count = 0; - // TODO: Make this general for all workloads + // TODO: Make this generalizable for all workloads. 
uint32_t best_update = -1; - int best_slice_id = -1; - int max_best_pending_count = 0; for (int i = 0; i < numTotalSlices; i++) { if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; - crowded_slice_id = i; } - if (numPendingUpdates[i] > max_best_pending_count && - workload->betterThan(bestPendingUpdate[i], best_update)) { + if (workload->betterThan(bestPendingUpdate[i], best_update)) { best_update = bestPendingUpdate[i]; - max_best_pending_count = numPendingUpdates[i]; - best_slice_id = i; } } if (chooseBest) { - return best_slice_id; + int max_count = 0; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] > max_count && + bestPendingUpdate[i] == best_update) { + max_count = numPendingUpdates[i]; + ret_slice_id = i; + } + } } else { - return crowded_slice_id; + uint32_t best_value = -1; + for (int i = 0; i < numTotalSlices; i++) { + if (numPendingUpdates[i] == max_pending_count && + workload->betterThan(bestPendingUpdate[i], best_value)) { + best_value = bestPendingUpdate[i]; + ret_slice_id = i; + } + } } + return ret_slice_id; } void From 052289af49c79ba6f39285b892281ea7e4c2c94b Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Thu, 13 Apr 2023 21:47:34 -0700 Subject: [PATCH 279/287] Fixing choosing next slice. 
--- src/accl/graph/sega/centeral_controller.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index add2296073..7677376cc1 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -266,7 +266,8 @@ CenteralController::chooseNextSlice() if (numPendingUpdates[i] > max_pending_count) { max_pending_count = numPendingUpdates[i]; } - if (workload->betterThan(bestPendingUpdate[i], best_update)) { + if (numPendingUpdates[i] > 0 && + workload->betterThan(bestPendingUpdate[i], best_update)) { best_update = bestPendingUpdate[i]; } } From f3ed72c90e3d29e4ea10dd50ed6d079ad3d17a07 Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Apr 2023 17:39:56 -0700 Subject: [PATCH 280/287] Fixing sign extend issue when address is bigger than 2GB. --- configs/accl/sega.py | 2 +- configs/accl/sega_simple.py | 2 +- src/accl/graph/sega/centeral_controller.cc | 16 ++++++++-------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/configs/accl/sega.py b/configs/accl/sega.py index 2df36fab20..17d84bd86c 100644 --- a/configs/accl/sega.py +++ b/configs/accl/sega.py @@ -154,7 +154,7 @@ def __init__(self, mirror_bw): latency="0ns", latency_var="0ns", bandwidth=mirror_bw, - range=AddrRange(start=0, size="4GiB"), + range=AddrRange(start=0, size="16GiB"), in_addr_map=False, ), ) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index bdecd24c2f..4439ff9a2f 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -149,7 +149,7 @@ def __init__(self, mirror_bw): latency="0ns", latency_var="0ns", bandwidth=mirror_bw, - range=AddrRange(start=0, size="4GiB"), + range=AddrRange(start=0, size="16GiB"), in_addr_map=False, ), ) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 7677376cc1..ce0c700fe0 100644 --- 
a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -305,14 +305,14 @@ CenteralController::processNextSliceSwitchEvent() continue; } int updates_generated = 0; - Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(int); - Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(int); + Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t); + Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t); PacketPtr start = createReadPacket(start_pointer, sizeof(int)); PacketPtr end = createReadPacket(end_pointer, sizeof(int)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); - Addr start_addr = start->getLE(); - Addr end_addr = end->getLE(); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); delete start; delete end; DPRINTF(CenteralController, "%s: %d->%d: [%lu, %lu].\n", __func__, @@ -384,14 +384,14 @@ CenteralController::processNextSliceSwitchEvent() if (src_id == currentSliceId) { continue; } - Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(int); - Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(int); + Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t); + Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t); PacketPtr start = createReadPacket(start_pointer, sizeof(int)); PacketPtr end = createReadPacket(end_pointer, sizeof(int)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); - Addr start_addr = start->getLE(); - Addr end_addr = end->getLE(); + Addr start_addr = start->getLE(); + Addr end_addr = end->getLE(); delete start; delete end; From ec5d0439e45d33c068346502a096913776a3608a Mon Sep 17 00:00:00 2001 From: Mahyar Samani Date: Mon, 17 Apr 2023 19:07:10 -0700 Subject: [PATCH 281/287] Fixing the packet size issue. 
--- src/accl/graph/sega/centeral_controller.cc | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index ce0c700fe0..6fb6abab38 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -307,8 +307,8 @@ CenteralController::processNextSliceSwitchEvent() int updates_generated = 0; Addr start_pointer = (currentSliceId * numTotalSlices + dst_id) * sizeof(uint64_t); Addr end_pointer = (currentSliceId * numTotalSlices + dst_id + 1) * sizeof(uint64_t); - PacketPtr start = createReadPacket(start_pointer, sizeof(int)); - PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); Addr start_addr = start->getLE(); @@ -386,8 +386,8 @@ CenteralController::processNextSliceSwitchEvent() } Addr start_pointer = (src_id * numTotalSlices + currentSliceId) * sizeof(uint64_t); Addr end_pointer = (src_id * numTotalSlices + currentSliceId + 1) * sizeof(uint64_t); - PacketPtr start = createReadPacket(start_pointer, sizeof(int)); - PacketPtr end = createReadPacket(end_pointer, sizeof(int)); + PacketPtr start = createReadPacket(start_pointer, sizeof(uint64_t)); + PacketPtr end = createReadPacket(end_pointer, sizeof(uint64_t)); mapPort.sendFunctional(start); mapPort.sendFunctional(end); Addr start_addr = start->getLE(); From 37e10f30fb3fd2357756d10d4c968884e09a8bab Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 16 Jan 2023 12:29:36 -0800 Subject: [PATCH 282/287] Initial commit for router --- src/accl/graph/sega/router_engine.cc | 2 +- src/accl/graph/sega/router_engine.hh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 
bcd8479df0..95bf6e9807 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -702,4 +702,4 @@ RouterEngine::RouterEngineStat::regStats() .desc(""); } } -}// namespace gem5 \ No newline at end of file +}// namespace gem5 diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index 9bd44b6147..bf8a53d053 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -214,4 +214,4 @@ class RouterEngine : public ClockedObject } -#endif // __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ \ No newline at end of file +#endif // __ACCL_GRAPH_SEGA_ROUTER_ENGINE_HH__ From c534735744a4234d715ea5d2d376ec6f97abb70a Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 30 Jan 2023 17:04:43 -0800 Subject: [PATCH 283/287] accl: Fixing the router Retry mechanism and the config files are fixed. Tested with multiple GPTs/Router --- configs/accl/sega_simple.py | 4 ++++ configs/accl/sega_simple_pt2pt.py | 8 ++++++++ src/accl/graph/sega/centeral_controller.cc | 5 +++++ src/accl/graph/sega/centeral_controller.hh | 4 +++- src/accl/graph/sega/router_engine.cc | 7 +++++++ src/accl/graph/sega/router_engine.hh | 1 - 6 files changed, 27 insertions(+), 2 deletions(-) diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 4439ff9a2f..4c90051aa5 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -214,6 +214,10 @@ def __init__(self, num_gpts, num_registers, cache_size, self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] self.ctrl.router_vector = [] + # self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + # self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] + # self.ctrl.router_vector = [] + def work_count(self): return self.ctrl.controller.workCount() diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index 9c2dd17481..b625d800d6 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ 
b/configs/accl/sega_simple_pt2pt.py @@ -146,7 +146,11 @@ def __init__(self, num_gpts, num_registers, cache_size, self.cache_line_size = 32 self.mem_mode = "timing" +<<<<<<< HEAD GPTPerGPN = 8 +======= + GPTPerGPN = 2 +>>>>>>> accl: Fixing the router # Building the CenteralController self.ctrl = CenteralController( @@ -174,10 +178,14 @@ def __init__(self, num_gpts, num_registers, cache_size, gpts.append(gpt) for i in range(int(num_gpts/GPTPerGPN)): routers.append( +<<<<<<< HEAD RouterEngine( gpn_queue_size = r_queue_size, gpt_queue_size = r_queue_size, router_latency = r_latency)) +======= + RouterEngine(gpn_queue_size = 64, gpt_queue_size = 64)) +>>>>>>> accl: Fixing the router self.routers = routers # for gpt_0 in gpts: # for gpt_1 in gpts: diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6fb6abab38..6adf636845 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -73,6 +73,11 @@ CenteralController::getPort(const std::string& if_name, PortID idx) } else { return ClockedObject::getPort(if_name, idx); } + + for (auto router : params.router_vector) { + routerVector.push_back(router); + router->registerCenteralController(this); + } } void diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 883437a202..5313d514e6 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -76,7 +76,9 @@ class CenteralController : public BaseMemoryEngine memory::SimpleMemory* mirrorsMem; std::vector mpuVector; - AddrRangeMap mpuAddrMap; + std::vector routerVector; + + std::unordered_map addrRangeListMap; int currentSliceId; int numTotalSlices; diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 95bf6e9807..e7765916e2 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -89,6 +89,13 @@ 
RouterEngine::registerCenteralController( centeralController = centeral_controller; } +void +RouterEngine::registerCenteralController( + CenteralController* centeral_controller) +{ + centeralController = centeral_controller; +} + AddrRangeList RouterEngine::GPTRespPort::getAddrRanges() const { diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index bf8a53d053..a69f4c3ca7 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -196,7 +196,6 @@ class RouterEngine : public ClockedObject public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); - void registerCenteralController(CenteralController* centeral_controller); virtual void init() override; virtual void startup() override; From f4839e8443bf0dd2019329ac1e1c661b09f3b98e Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Mon, 6 Feb 2023 16:26:07 -0800 Subject: [PATCH 284/287] accl: Improving the router latency Adjust the router latency based on each links last service --- configs/accl/bfs.py | 2 ++ configs/accl/sega_simple_pt2pt.py | 12 ++---------- src/accl/graph/sega/router_engine.hh | 9 ++++++++- 3 files changed, 12 insertions(+), 11 deletions(-) diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 1a58a5fc41..6a9ea4fb13 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -145,8 +145,10 @@ def get_inputs(): from sega_simple import SEGA else: from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, r_queue_size, r_latency, graph) + root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index b625d800d6..d44f488886 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ b/configs/accl/sega_simple_pt2pt.py @@ -146,11 +146,7 @@ def __init__(self, num_gpts, num_registers, cache_size, self.cache_line_size = 32 self.mem_mode = "timing" -<<<<<<< HEAD GPTPerGPN = 8 -======= - GPTPerGPN = 2 ->>>>>>> accl: Fixing the 
router # Building the CenteralController self.ctrl = CenteralController( @@ -177,15 +173,11 @@ def __init__(self, num_gpts, num_registers, cache_size, ) gpts.append(gpt) for i in range(int(num_gpts/GPTPerGPN)): - routers.append( -<<<<<<< HEAD - RouterEngine( + routers.append(RouterEngine( gpn_queue_size = r_queue_size, gpt_queue_size = r_queue_size, router_latency = r_latency)) -======= - RouterEngine(gpn_queue_size = 64, gpt_queue_size = 64)) ->>>>>>> accl: Fixing the router + self.routers = routers # for gpt_0 in gpts: # for gpt_1 in gpts: diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index a69f4c3ca7..d6c9650fc9 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -192,10 +192,13 @@ class RouterEngine : public ClockedObject std::vector internalTrafficHist; }; RouterEngineStat stats; - public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); +<<<<<<< HEAD +======= + +>>>>>>> accl: Improving the router latency void registerCenteralController(CenteralController* centeral_controller); virtual void init() override; virtual void startup() override; @@ -209,6 +212,10 @@ class RouterEngine : public ClockedObject void checkGPTRetryReq(); void checkGPNRetryReq(); bool done(); +<<<<<<< HEAD +======= + +>>>>>>> accl: Improving the router latency }; } From f4acaeb8ad1d64549c51cc9ed569af8d059994b6 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 25 Aug 2023 10:55:11 -0700 Subject: [PATCH 285/287] Update router with link bandwidth and sampling --- configs/accl/bc.py | 35 ++++- configs/accl/bfs.py | 14 +- configs/accl/cc.py | 37 ++++- configs/accl/pr.py | 30 +++- configs/accl/sega-simple.py | 165 --------------------- configs/accl/sega_simple.py | 2 +- configs/accl/sega_simple_pt2pt.py | 132 ++++++++++++----- configs/accl/sssp.py | 35 ++++- src/accl/graph/sega/RouterEngine.py | 3 + src/accl/graph/sega/centeral_controller.cc | 2 +- src/accl/graph/sega/centeral_controller.hh | 1 + 
src/accl/graph/sega/router_engine.cc | 95 ++++++++---- src/accl/graph/sega/router_engine.hh | 14 ++ 13 files changed, 325 insertions(+), 240 deletions(-) delete mode 100644 configs/accl/sega-simple.py diff --git a/configs/accl/bc.py b/configs/accl/bc.py index 56faeb3e4d..9a0bf298b5 100644 --- a/configs/accl/bc.py +++ b/configs/accl/bc.py @@ -36,9 +36,14 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("r_queue_size", type=int) + argparser.add_argument("r_latency", type=int) + argparser.add_argument("gpt_per_gpn", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument("sample_time", type=str) + argparser.add_argument("tokens", type=int) argparser.add_argument( "--simple", dest="simple", @@ -47,6 +52,14 @@ def get_inputs(): default=False, help="Use simple memory for vertex", ) + argparser.add_argument( + "--pt2pt", + dest="pt2pt", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -70,10 +83,16 @@ def get_inputs(): args.num_gpts, args.num_registers, args.cache_size, + args.r_queue_size, + args.r_latency, + args.gpt_per_gpn, args.graph, args.init_addr, args.init_value, + args.sample_time, + args.tokens, args.simple, + args.pt2pt, args.sample, args.verify, ) @@ -84,19 +103,31 @@ def get_inputs(): num_gpts, num_registers, cache_size, + r_queue_size, + r_latency, + gpt_per_gpn, graph, init_addr, init_value, + sample_time, + tokens, simple, + pt2pt, sample, verify, ) = get_inputs() if simple: - from sega_simple import SEGA + if pt2pt: + from sega_simple_pt2pt import SEGA + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, gpt_per_gpn, graph, sample_time, tokens) + else: + from sega_simple import 
SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) else: from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/bfs.py b/configs/accl/bfs.py index 6a9ea4fb13..c2150ce751 100644 --- a/configs/accl/bfs.py +++ b/configs/accl/bfs.py @@ -38,9 +38,12 @@ def get_inputs(): argparser.add_argument("cache_size", type=str) argparser.add_argument("r_queue_size", type=int) argparser.add_argument("r_latency", type=int) + argparser.add_argument("gpt_per_gpn", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument("sample_time", type=str) + argparser.add_argument("tokens", type=int) argparser.add_argument( "--tile", dest="tile", @@ -106,9 +109,12 @@ def get_inputs(): args.cache_size, args.r_queue_size, args.r_latency, + args.gpt_per_gpn, args.graph, args.init_addr, args.init_value, + args.sample_time, + args.tokens, args.tile, args.best, args.visited, @@ -126,9 +132,12 @@ def get_inputs(): cache_size, r_queue_size, r_latency, + gpt_per_gpn, graph, init_addr, init_value, + sample_time, + tokens, tile, best, visited, @@ -141,8 +150,11 @@ def get_inputs(): if simple: if pt2pt: from sega_simple_pt2pt import SEGA + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, gpt_per_gpn, graph, sample_time, tokens) else: from sega_simple import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) else: from sega import SEGA @@ -150,9 +162,7 @@ def get_inputs(): r_queue_size, r_latency, graph) root = Root(full_system=False, system=system) - m5.instantiate() - if tile: system.set_pg_mode() else: diff --git a/configs/accl/cc.py b/configs/accl/cc.py index 9b6d2b587d..03b3d04d46 100644 --- a/configs/accl/cc.py +++ b/configs/accl/cc.py @@ -36,7 +36,12 @@ def 
get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("r_queue_size", type=int) + argparser.add_argument("r_latency", type=int) + argparser.add_argument("gpt_per_gpn", type=int) argparser.add_argument("graph", type=str) + argparser.add_argument("sample_time", type=str) + argparser.add_argument("tokens", type=int) argparser.add_argument( "--simple", dest="simple", @@ -45,6 +50,14 @@ def get_inputs(): default=False, help="Use simple memory for vertex", ) + argparser.add_argument( + "--pt2pt", + dest="pt2pt", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -68,8 +81,14 @@ def get_inputs(): args.num_gpts, args.num_registers, args.cache_size, + args.r_queue_size, + args.r_latency, + args.gpt_per_gpn, args.graph, + args.sample_time, + args.tokens, args.simple, + args.pt2pt, args.sample, args.verify, ) @@ -80,17 +99,29 @@ def get_inputs(): num_gpts, num_registers, cache_size, + r_queue_size, + r_latency, + gpt_per_gpn, graph, + sample_time, + tokens, simple, + pt2pt, sample, verify, ) = get_inputs() - + print(sample_time) if simple: - from sega_simple import SEGA + if pt2pt: + from sega_simple_pt2pt import SEGA + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, gpt_per_gpn, graph, sample_time, tokens) + else: + from sega_simple import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) else: from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/configs/accl/pr.py b/configs/accl/pr.py index a7f30f02e9..7ef6587ab3 100644 --- a/configs/accl/pr.py +++ b/configs/accl/pr.py @@ -38,9 +38,12 @@ def get_inputs(): argparser.add_argument("cache_size", 
type=str) argparser.add_argument("r_queue_size", type=int) argparser.add_argument("r_latency", type=int) + argparser.add_argument("gpt_per_gpn", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("iterations", type=int) argparser.add_argument("alpha", type=float) + argparser.add_argument("sample_time", type=str) + argparser.add_argument("tokens", type=int) argparser.add_argument("--num_nodes", type=int, default=1) argparser.add_argument("--error_threshold", type=float, default=0.0) argparser.add_argument( @@ -51,6 +54,14 @@ def get_inputs(): default=False, help="Use simple memory for vertex", ) + argparser.add_argument( + "--pt2pt", + dest="pt2pt", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -76,12 +87,16 @@ def get_inputs(): args.cache_size, args.r_queue_size, args.r_latency, + args.gpt_per_gpn, args.graph, args.iterations, args.alpha, args.num_nodes, args.error_threshold, + args.sample_time, + args.tokens, args.simple, + args.pt2pt, args.sample, args.verify, ) @@ -94,12 +109,16 @@ def get_inputs(): cache_size, r_queue_size, r_latency, + gpt_per_gpn, graph, iterations, alpha, num_nodes, error_threshold, + sample_time, + tokens, simple, + pt2pt, sample, verify, ) = get_inputs() @@ -107,11 +126,16 @@ def get_inputs(): print(f"error_threshold: {error_threshold}") if simple: - from sega_simple_pt2pt import SEGA + if pt2pt: + from sega_simple_pt2pt import SEGA + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, gpt_per_gpn, graph, sample_time, tokens) + else: + from sega import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) else: from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, - r_queue_size, r_latency, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git 
a/configs/accl/sega-simple.py b/configs/accl/sega-simple.py deleted file mode 100644 index 54a90281bf..0000000000 --- a/configs/accl/sega-simple.py +++ /dev/null @@ -1,165 +0,0 @@ -# Copyright (c) 2022 The Regents of the University of California -# All rights reserved. -# -# Redistribution and use in source and binary forms, with or without -# modification, are permitted provided that the following conditions are -# met: redistributions of source code must retain the above copyright -# notice, this list of conditions and the following disclaimer; -# redistributions in binary form must reproduce the above copyright -# notice, this list of conditions and the following disclaimer in the -# documentation and/or other materials provided with the distribution; -# neither the name of the copyright holders nor the names of its -# contributors may be used to endorse or promote products derived from -# this software without specific prior written permission. -# -# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS -# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT -# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR -# A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT -# OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, -# SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT -# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, -# DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY -# THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT -# (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE -# OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
- -import m5 -import argparse - -from math import log -from m5.objects import * - -def interleave_addresses(plain_range, num_channels, cache_line_size): - intlv_low_bit = log(cache_line_size, 2) - intlv_bits = log(num_channels, 2) - ret = [] - for i in range(num_channels): - ret.append(AddrRange( - start=plain_range.start, - size=plain_range.size(), - intlvHighBit=intlv_low_bit + intlv_bits - 1, - xorHighBit=0, - intlvBits=intlv_bits, - intlvMatch=i)) - return ret - -class GPT(SubSystem): - def __init__(self, edge_memory_size: str, cache_size: str): - super().__init__() - self.wl_engine = WLEngine( - update_queue_size=128, - register_file_size=64 - ) - self.coalesce_engine = CoalesceEngine( - attached_memory_atom_size=32, - cache_size=cache_size, - num_mshr_entry=64, - num_tgts_per_mshr=64, - max_resp_per_cycle=8 - ) - self.push_engine = PushEngine( - push_req_queue_size=32, - attached_memory_atom_size=64, - resp_queue_size=64, - update_queue_size=16, - ) - - self.vertex_mem_ctrl = SimpleMemory( - latency="0ns", - latency_var="0ns", - bandwidth="0GB/s" - ) - - self.edge_mem_ctrl = SimpleMemory( - latency="30ns", - latency_var="0ns", - bandwidth="32GB/s", - range=AddrRange(edge_memory_size), - in_addr_map=False - ) - - self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port - self.push_engine.mem_port = self.edge_mem_ctrl.port - - self.mpu = MPU( - wl_engine=self.wl_engine, - coalesce_engine=self.coalesce_engine, - push_engine=self.push_engine - ) - - def getRespPort(self): - return self.wl_engine.in_ports - def setRespPort(self, port): - self.wl_engine.in_ports = port - - def getReqPort(self): - return self.push_engine.out_ports - def setReqPort(self, port): - self.push_engine.out_ports = port - - def set_vertex_range(self, vertex_range): - self.vertex_mem_ctrl.range = vertex_range - def set_edge_image(self, edge_image): - self.edge_mem_ctrl.image_file = edge_image - -class SEGA(System): - def __init__(self, num_mpus, cache_size, graph_path): - super(SEGA, 
self).__init__() - self.clk_domain = SrcClockDomain() - self.clk_domain.clock = '2GHz' - self.clk_domain.voltage_domain = VoltageDomain() - self.cache_line_size = 32 - self.mem_mode = "timing" - - self.ctrl = CenteralController(image_file=f"{graph_path}/vertices") - - vertex_ranges = interleave_addresses( - AddrRange(start=0, size="4GiB"), - num_mpus, - 32 - ) - - gpts = [] - for i in range(num_mpus): - gpt = GPT("8GiB", cache_size) - gpt.set_vertex_range(vertex_ranges[i]) - gpt.set_edge_image(f"{graph_path}/edgelist_{i}") - gpts.append(gpt) - # Creating the interconnect among mpus - for gpt_0 in gpts: - for gpt_1 in gpts: - gpt_0.setReqPort(gpt_1.getRespPort()) - self.gpts = gpts - - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - - def create_initial_bfs_update(self, init_addr, init_value): - self.ctrl.createInitialBFSUpdate(init_addr, init_value) - -def get_inputs(): - argparser = argparse.ArgumentParser() - argparser.add_argument("num_gpts", type=int) - argparser.add_argument("cache_size", type=str) - argparser.add_argument("graph", type=str) - argparser.add_argument("init_addr", type=int) - argparser.add_argument("init_value", type=int) - - args = argparser.parse_args() - - return args.num_gpts, args.cache_size, \ - args.graph, args.init_addr, args.init_value - -if __name__ == "__m5_main__": - num_gpts, cache_size, graph, init_addr, init_value = get_inputs() - - system = SEGA(num_gpts, cache_size, graph) - root = Root(full_system = False, system = system) - - m5.instantiate() - - system.create_initial_bfs_update(init_addr, init_value) - - exit_event = m5.simulate() - print(f"Exited simulation at tick {m5.curTick()} " + \ - f"because {exit_event.getCause()}") diff --git a/configs/accl/sega_simple.py b/configs/accl/sega_simple.py index 4c90051aa5..08f0f181ba 100644 --- a/configs/accl/sega_simple.py +++ b/configs/accl/sega_simple.py @@ -264,4 +264,4 @@ def create_bc_workload(self, init_addr, init_value): self.ctrl.controller.createBCWorkload(init_addr, 
init_value) def print_answer(self): - self.ctrl.controller.printAnswerToHostSimout() + self.ctrl.controller.printAnswerToHostSimout() \ No newline at end of file diff --git a/configs/accl/sega_simple_pt2pt.py b/configs/accl/sega_simple_pt2pt.py index d44f488886..5b7309d44f 100644 --- a/configs/accl/sega_simple_pt2pt.py +++ b/configs/accl/sega_simple_pt2pt.py @@ -52,10 +52,10 @@ def __init__(self, register_file_size: int, cache_size: str): self.wl_engine = WLEngine( update_queue_size=64, register_file_size=register_file_size, + examine_window=8, rd_per_cycle=4, reduce_per_cycle=32, wr_per_cycle=4, - num_updates_processed=8, ) self.coalesce_engine = CoalesceEngine( attached_memory_atom_size=32, @@ -64,17 +64,19 @@ def __init__(self, register_file_size: int, cache_size: str): pending_pull_limit=64, active_buffer_size=80, post_push_wb_queue_size=64, + transitions_per_cycle=4, ) self.push_engine = PushEngine( push_req_queue_size=32, attached_memory_atom_size=64, - resp_queue_size=4096, + resp_queue_size=1024, + examine_window=12, max_propagates_per_cycle=8, - update_queue_size=32, + update_queue_size=64, ) self.vertex_mem_ctrl = SimpleMemory( - latency="122ns", latency_var="0ns", bandwidth="28GiB/s" + latency="120ns", bandwidth="256GiB/s" ) self.coalesce_engine.mem_port = self.vertex_mem_ctrl.port @@ -131,11 +133,50 @@ def setPort(self, port): self.xbar.cpu_side_ports = port +class SEGAController(SubSystem): + def __init__(self, mirror_bw): + super().__init__() + self.map_mem = SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth="1024GiB/s", + range=AddrRange(start=0, size="4GiB"), + in_addr_map=False, + ) + self.controller = CenteralController( + choose_best=False, + mirrors_mem=SimpleMemory( + latency="0ns", + latency_var="0ns", + bandwidth=mirror_bw, + range=AddrRange(start=0, size="16GiB"), + in_addr_map=False, + ), + ) + self.controller.mem_port = self.controller.mirrors_mem.port + self.controller.mirrors_map_mem = self.map_mem.port + + def 
set_choose_best(self, choose_best): + self.controller.choose_best = choose_best + + def set_vertices_image(self, vertices): + self.controller.vertex_image_file = vertices + + def set_aux_images(self, mirrors, mirrors_map): + self.controller.mirrors_mem.image_file = mirrors + self.map_mem.image_file = mirrors_map + + def set_mpu_vector(self, mpu_vector): + self.controller.mpu_vector = mpu_vector + + def set_router_vector(self, router_vector): + self.controller.router_vector = router_vector + + class SEGA(System): - def __init__(self, num_gpts, num_registers, cache_size, - r_queue_size, r_latency, graph_path): + def __init__(self, num_gpts, num_registers, cache_size, + r_queue_size, r_latency, gpt_per_gpn, graph_path, sample_time, tokens): super(SEGA, self).__init__() - # num_gpts should be an even power of 2 assert num_gpts != 0 assert num_gpts % 2 == 0 assert (num_gpts & (num_gpts - 1)) == 0 @@ -145,17 +186,17 @@ def __init__(self, num_gpts, num_registers, cache_size, self.clk_domain.voltage_domain = VoltageDomain() self.cache_line_size = 32 self.mem_mode = "timing" + + gpts = [] + routers = [] + GPTPerGPN = gpt_per_gpn - GPTPerGPN = 8 + self.ctrl = SEGAController("256GiB/s") + self.ctrl.set_vertices_image(f"{graph_path}/vertices") - # Building the CenteralController - self.ctrl = CenteralController( - vertex_image_file=f"{graph_path}/vertices" - ) - # Building the EdgeMemories edge_mem = [] for i in range(int(num_gpts / 2)): - mem = EdgeMemory("4GiB") + mem = EdgeMemory("16GiB") mem.set_image(f"{graph_path}/edgelist_{i}") edge_mem.append(mem) self.edge_mem = edge_mem @@ -163,8 +204,7 @@ def __init__(self, num_gpts, num_registers, cache_size, vertex_ranges = interleave_addresses( AddrRange(start=0, size="4GiB"), num_gpts, 32 ) - gpts = [] - routers = [] + for i in range(num_gpts): gpt = GPT(num_registers, cache_size) gpt.set_vertex_range(vertex_ranges[i]) @@ -172,6 +212,12 @@ def __init__(self, num_gpts, num_registers, cache_size, self.edge_mem[i % (int(num_gpts / 
2))].getPort() ) gpts.append(gpt) + + # Creating the interconnect among mpus + # for gpt_0 in gpts: + # for gpt_1 in gpts: + # gpt_0.setReqPort(gpt_1.getRespPort()) + for i in range(int(num_gpts/GPTPerGPN)): routers.append(RouterEngine( gpn_queue_size = r_queue_size, @@ -186,57 +232,71 @@ def __init__(self, num_gpts, num_registers, cache_size, for i in range(len(gpts)): for j in range(len(gpts)): if (int(i / GPTPerGPN) == int(j / GPTPerGPN) ): - print(i, j) + # print(i, j) gpts[i].setReqPort(gpts[j].getRespPort()) - print("gpt, Router") + # print("gpt, Router") for i in range(len(gpts)): for j in range(len(routers)): if (int(i / GPTPerGPN) == j): - print(i, j) + # print(i, j) gpts[i].setRespPort(routers[j].gpt_req_side) gpts[i].setReqPort(routers[j].gpt_resp_side) + # print("router, router") for r_0 in routers: for r_1 in routers: if r_0 != r_1: - r_0.gpn_resp_side = r_1.gpn_req_side + # print(r_0, r_1) + r_0.gpn_resp_side = r_1.gpn_req_side self.gpts = gpts - self.ctrl.mpu_vector = [gpt.mpu for gpt in self.gpts] - self.ctrl.router_vector = [r for r in self.routers] + self.routers = routers + + self.ctrl.set_mpu_vector([gpt.mpu for gpt in self.gpts]) + self.ctrl.set_router_vector([r for r in self.routers]) def work_count(self): - return self.ctrl.workCount() + return self.ctrl.controller.workCount() def set_async_mode(self): - self.ctrl.setAsyncMode() + self.ctrl.controller.setAsyncMode() def set_bsp_mode(self): - self.ctrl.setBSPMode() + self.ctrl.controller.setBSPMode() + + def set_pg_mode(self): + self.ctrl.controller.setPGMode() + + def set_aux_images(self, mirrors, mirrors_map): + self.ctrl.set_aux_images(mirrors, mirrors_map) + + def set_choose_best(self, choose_best): + self.ctrl.set_choose_best(choose_best) def create_pop_count_directory(self, atoms_per_block): - self.ctrl.createPopCountDirectory(atoms_per_block) + self.ctrl.controller.createPopCountDirectory(atoms_per_block) def create_bfs_workload(self, init_addr, init_value): - 
self.ctrl.createBFSWorkload(init_addr, init_value) + self.ctrl.controller.createBFSWorkload(init_addr, init_value) def create_bfs_visited_workload(self, init_addr, init_value): - self.ctrl.createBFSVisitedWorkload(init_addr, init_value) + self.ctrl.controller.createBFSVisitedWorkload(init_addr, init_value) def create_sssp_workload(self, init_addr, init_value): - self.ctrl.createSSSPWorkload(init_addr, init_value) + self.ctrl.controller.createSSSPWorkload(init_addr, init_value) def create_cc_workload(self): - self.ctrl.createCCWorkload() + self.ctrl.controller.createCCWorkload() def create_async_pr_workload(self, alpha, threshold): - self.ctrl.createAsyncPRWorkload(alpha, threshold) - def get_pr_error(self): - return self.ctrl.getPRError() + self.ctrl.controller.createAsyncPRWorkload(alpha, threshold) def create_pr_workload(self, num_nodes, alpha): - self.ctrl.createPRWorkload(num_nodes, alpha) + self.ctrl.controller.createPRWorkload(num_nodes, alpha) + + def get_pr_error(self): + return self.ctrl.controller.getPRError() def create_bc_workload(self, init_addr, init_value): - self.ctrl.createBCWorkload(init_addr, init_value) + self.ctrl.controller.createBCWorkload(init_addr, init_value) def print_answer(self): - self.ctrl.printAnswerToHostSimout() + self.ctrl.controller.printAnswerToHostSimout() diff --git a/configs/accl/sssp.py b/configs/accl/sssp.py index f2e60b856a..08581bbb81 100644 --- a/configs/accl/sssp.py +++ b/configs/accl/sssp.py @@ -36,9 +36,14 @@ def get_inputs(): argparser.add_argument("num_gpts", type=int) argparser.add_argument("num_registers", type=int) argparser.add_argument("cache_size", type=str) + argparser.add_argument("r_queue_size", type=int) + argparser.add_argument("r_latency", type=int) + argparser.add_argument("gpt_per_gpn", type=int) argparser.add_argument("graph", type=str) argparser.add_argument("init_addr", type=int) argparser.add_argument("init_value", type=int) + argparser.add_argument("sample_time", type=str) + 
argparser.add_argument("tokens", type=int) argparser.add_argument( "--simple", dest="simple", @@ -47,6 +52,14 @@ def get_inputs(): default=False, help="Use simple memory for vertex", ) + argparser.add_argument( + "--pt2pt", + dest="pt2pt", + action="store_const", + const=True, + default=False, + help="Use simple memory for vertex", + ) argparser.add_argument( "--sample", dest="sample", @@ -70,10 +83,16 @@ def get_inputs(): args.num_gpts, args.num_registers, args.cache_size, + args.r_queue_size, + args.r_latency, + args.gpt_per_gpn, args.graph, args.init_addr, args.init_value, + args.sample_time, + args.tokens, args.simple, + args.pt2pt, args.sample, args.verify, ) @@ -84,19 +103,31 @@ def get_inputs(): num_gpts, num_registers, cache_size, + r_queue_size, + r_latency, + gpt_per_gpn, graph, init_addr, init_value, + sample_time, + tokens, simple, + pt2pt, sample, verify, ) = get_inputs() if simple: - from sega_simple import SEGA + if pt2pt: + from sega_simple_pt2pt import SEGA + system = SEGA(num_gpts, num_registers, cache_size, + r_queue_size, r_latency, gpt_per_gpn, graph, sample_time, tokens) + else: + from sega_simple import SEGA + system = SEGA(num_gpts, num_registers, cache_size, graph) else: from sega import SEGA - system = SEGA(num_gpts, num_registers, cache_size, graph) + system = SEGA(num_gpts, num_registers, cache_size, graph) root = Root(full_system=False, system=system) m5.instantiate() diff --git a/src/accl/graph/sega/RouterEngine.py b/src/accl/graph/sega/RouterEngine.py index 8182e81720..2b895b9323 100644 --- a/src/accl/graph/sega/RouterEngine.py +++ b/src/accl/graph/sega/RouterEngine.py @@ -43,5 +43,8 @@ class RouterEngine(ClockedObject): gpn_resp_side = VectorResponsePort("incoming ports from local GPNs") gpt_queue_size = Param.Int(64, "Queue size on the gpt side") gpn_queue_size = Param.Int(64, "Queue size on the gpt side") + token = Param.Int("Number of tokens sent per time sample.") router_latency = Param.Cycles(5, "Router latency, " "SerDes or 
E-O-E latencies can be added here") + + sample_time = Param.Latency("50us", "Intervals to sample traffic") diff --git a/src/accl/graph/sega/centeral_controller.cc b/src/accl/graph/sega/centeral_controller.cc index 6adf636845..09b57b6ff6 100644 --- a/src/accl/graph/sega/centeral_controller.cc +++ b/src/accl/graph/sega/centeral_controller.cc @@ -74,7 +74,7 @@ CenteralController::getPort(const std::string& if_name, PortID idx) return ClockedObject::getPort(if_name, idx); } - for (auto router : params.router_vector) { + for (auto router : params().router_vector) { routerVector.push_back(router); router->registerCenteralController(this); } diff --git a/src/accl/graph/sega/centeral_controller.hh b/src/accl/graph/sega/centeral_controller.hh index 5313d514e6..ac06b76edc 100644 --- a/src/accl/graph/sega/centeral_controller.hh +++ b/src/accl/graph/sega/centeral_controller.hh @@ -76,6 +76,7 @@ class CenteralController : public BaseMemoryEngine memory::SimpleMemory* mirrorsMem; std::vector mpuVector; + AddrRangeMap mpuAddrMap; std::vector routerVector; std::unordered_map addrRangeListMap; diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index e7765916e2..4d433a71fb 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -42,12 +42,17 @@ RouterEngine::RouterEngine(const Params ¶ms): gpnQSize(params.gpn_queue_size), emptyQueues(false), routerLatency(params.router_latency), + start(0), + sampleTime(params.sample_time), + tokens(params.token), nextGPTGPNEvent([this] { processNextGPTGPNEvent(); }, name()), nextInternalRequestEvent( [this] { processNextInternalRequestEvent(); }, name()), nextGPNGPTEvent([this] { processNextGPNGPTEvent(); }, name()), nextExternalRequestEvent( [this] { processNextExternalRequestEvent(); }, name()), + nextTrafficTrackEvent( + [this] { processNextTrafficTrackEvent(); }, name()), stats(*this) { @@ -57,28 +62,26 @@ RouterEngine::RouterEngine(const Params ¶ms): // 
m_newTraffic.emplace_back(new statistics::Histogram()); // m_newTraffic[i]->init(10); } - for (int i = 0; i < params.port_gpt_resp_side_connection_count; ++i) { gptRespPorts.emplace_back( name() + ".gpt_resp_side" + std::to_string(i), this, i); } - for (int i = 0; i < params.port_gpn_req_side_connection_count; ++i) { gpnReqPorts.emplace_back( name() + ".gpn_req_side" + std::to_string(i), this, i); } - for (int i = 0; i < params.port_gpn_resp_side_connection_count; ++i) { gpnRespPorts.emplace_back( name() + ".gpn_resp_side" + std::to_string(i), this, i); } - for (int i = 0; i reset(); -// } -// } - void RouterEngine::startup() { for (int i = 0; i < gpnReqPorts.size(); i++) { routerAddrMap[gpnReqPorts[i].id()] = gpnReqPorts[i].getAddrRanges(); + tokenVector[i] = tokens; } - std::cout<<"gpnReqPorts: "<getAddr(), i, gpnRespQueues[gpnReqPorts[i].id()].size()); - stats.internalTrafficHist[gpnReqPorts[i].id()]->sample(gpnRespQueues[gpnReqPorts[i].id()].size()); queue.second.pop(); DPRINTF(RouterEngine, "%s: gptReqQueue size is: %d.\n", __func__, queue.second.size()); @@ -398,23 +392,32 @@ RouterEngine::processNextInternalRequestEvent() DPRINTF(RouterEngine, "%s: Sending a request between two routers.\n", __func__); bool none_empty_queue = false; + int id; for (auto &queue: gpnRespQueues) { if (!queue.second.empty()) { - if (!gpnReqPorts[queue.first].blocked()) { + id = gpnReqPorts[queue.first].id(); + if (!gpnReqPorts[queue.first].blocked() && (tokenVector[id] != 0)) { if ((curCycle() - internalLatency[gpnReqPorts[queue.first].id()]) < routerLatency) { continue; - } - stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; + } PacketPtr pkt = queue.second.front(); DPRINTF(RouterEngine, "%s: Sending packet %s to router: %d.\n", __func__, pkt->getAddr(), gpnReqPorts[queue.first].id()); gpnReqPorts[queue.first].sendPacket(pkt); + inFlightTraffic[queue.first]++; queue.second.pop(); internalLatency[gpnReqPorts[queue.first].id()] = curCycle(); - } - else { + 
stats.internalAcceptedTraffic[gpnReqPorts[queue.first].id()]++; + stats.totalInternalTraffic[gpnReqPorts[queue.first].id()] += + sizeof(pkt); + tokenVector[id]--; + } else if (tokenVector[id] == 0) { + DPRINTF(RouterEngine, "%s: Rand out of tokens for port id %d.\n", + __func__, id); + stats.bandwidthBlocked[id]++; + } else { DPRINTF(RouterEngine, "%s: port id %d is blocked.\n", __func__, gpnReqPorts[queue.first].id()); stats.internalBlockedTraffic[gpnReqPorts[queue.first].id()]++; @@ -457,6 +460,30 @@ RouterEngine::processNextInternalRequestEvent() if (none_empty_queue && (!nextInternalRequestEvent.scheduled())) { schedule(nextInternalRequestEvent, next_schedule); } + + if(!nextTrafficTrackEvent.scheduled() && (start == 0)) { + start = 1; + schedule(nextTrafficTrackEvent, next_schedule); + } +} + +void +RouterEngine::processNextTrafficTrackEvent() +{ + for (auto &queue: gpnRespQueues) { + stats.internalTrafficHist[queue.first]->sample(inFlightTraffic[queue.first]); + // stats.internalTrafficVector[queue.first][sample[queue.first]] = inFlightTraffic[queue.first]; + sample[queue.first]++; + inFlightTraffic[queue.first] = 0; + } + + for (int i = 0; i < gpnReqPorts.size(); i++) { + tokenVector[i] = tokens; + } + + if(!nextTrafficTrackEvent.scheduled()) { + schedule(nextTrafficTrackEvent, curTick() + sampleTime); + } } void @@ -688,7 +715,14 @@ RouterEngine::RouterEngineStat::RouterEngineStat(RouterEngine &_router) ADD_STAT(internalAcceptedTraffic, statistics::units::Count::get(), "Number of packet passed between routers."), ADD_STAT(externalAcceptedTraffic, statistics::units::Count::get(), - "Number of external packets passed.") + "Number of external packets passed."), + ADD_STAT(bandwidthBlocked, statistics::units::Count::get(), + "Number of packets blocked due to lack of."), + ADD_STAT(totalInternalTraffic, statistics::units::Count::get(), + "Total traffic sent from the internal port") + // , + // ADD_STAT(internalTrafficVector, statistics::units::Count::get(), + // 
"Number of requests sent in internal link") {} void @@ -700,13 +734,24 @@ RouterEngine::RouterEngineStat::regStats() externalBlockedTraffic.init(router.gptReqPorts.size()); internalAcceptedTraffic.init(router.gpnReqPorts.size()); externalAcceptedTraffic.init(router.gptReqPorts.size()); + bandwidthBlocked.init(router.gpnReqPorts.size()); + totalInternalTraffic.init(router.gpnReqPorts.size()); + // internalTrafficVector.init(router.gpnReqPorts.size(), 6000); for (uint32_t i = 0; i < router.gpnReqPorts.size(); ++i) { internalTrafficHist.push_back(new statistics::Histogram(this)); internalTrafficHist[i] - ->init(64) - .name(csprintf("internal_traffic_hist")) - .desc(""); + ->init(20000) + .name(csprintf("internal_traffic_hist_%i",i)) + .desc("") + .flags(nozero); + + internalPortBW.push_back(new statistics::Formula(this, + csprintf("average_internal_BW_%d", i).c_str(), + "Internal BW (GB/s)")); + + *internalPortBW[i] = + totalInternalTraffic[i] / (simSeconds*1e9); } } }// namespace gem5 diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index d6c9650fc9..d0274f86e3 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -148,6 +148,7 @@ class RouterEngine : public ClockedObject std::vector gpnReqPorts; std::vector gpnRespPorts; + std::unordered_map gptAddrMap; std::unordered_map routerAddrMap; @@ -159,11 +160,17 @@ class RouterEngine : public ClockedObject std::unordered_map externalLatency; std::unordered_map internalLatency; + std::vector inFlightTraffic; + std::vector tokenVector; + std::vector sample; const uint32_t gptQSize; const uint32_t gpnQSize; bool emptyQueues; const Cycles routerLatency; + int start; + Tick sampleTime; + int tokens; EventFunctionWrapper nextGPTGPNEvent; void processNextGPTGPNEvent(); @@ -177,6 +184,9 @@ class RouterEngine : public ClockedObject EventFunctionWrapper nextExternalRequestEvent; void processNextExternalRequestEvent(); + EventFunctionWrapper 
nextTrafficTrackEvent; + void processNextTrafficTrackEvent(); + struct RouterEngineStat : public statistics::Group { RouterEngineStat(RouterEngine &push); @@ -189,7 +199,11 @@ class RouterEngine : public ClockedObject statistics::Vector externalBlockedTraffic; statistics::Vector internalAcceptedTraffic; statistics::Vector externalAcceptedTraffic; + statistics::Vector bandwidthBlocked; + statistics::Vector totalInternalTraffic; + // statistics::Vector2d internalTrafficVector; std::vector internalTrafficHist; + std::vector internalPortBW; }; RouterEngineStat stats; public: From 334985a273bde6e6b28075ec2d73bf73c9ad9faf Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Fri, 25 Aug 2023 12:19:20 -0700 Subject: [PATCH 286/287] Updating the router --- src/accl/graph/sega/router_engine.cc | 12 ++++++------ src/accl/graph/sega/router_engine.hh | 8 -------- 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index 4d433a71fb..a36f28b025 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -92,12 +92,12 @@ RouterEngine::registerCenteralController( centeralController = centeral_controller; } -void -RouterEngine::registerCenteralController( - CenteralController* centeral_controller) -{ - centeralController = centeral_controller; -} +// void +// RouterEngine::registerCenteralController( +// CenteralController* centeral_controller) +// { +// centeralController = centeral_controller; +// } AddrRangeList RouterEngine::GPTRespPort::getAddrRanges() const diff --git a/src/accl/graph/sega/router_engine.hh b/src/accl/graph/sega/router_engine.hh index d0274f86e3..5c06ecc862 100644 --- a/src/accl/graph/sega/router_engine.hh +++ b/src/accl/graph/sega/router_engine.hh @@ -209,10 +209,6 @@ class RouterEngine : public ClockedObject public: PARAMS(RouterEngine); RouterEngine(const Params ¶ms); -<<<<<<< HEAD -======= - ->>>>>>> accl: Improving the router latency 
void registerCenteralController(CenteralController* centeral_controller); virtual void init() override; virtual void startup() override; @@ -226,10 +222,6 @@ class RouterEngine : public ClockedObject void checkGPTRetryReq(); void checkGPNRetryReq(); bool done(); -<<<<<<< HEAD -======= - ->>>>>>> accl: Improving the router latency }; } From 167542e7c057e2df7eee2fbe1d9c0c4651e687b9 Mon Sep 17 00:00:00 2001 From: Marjan Fariborz Date: Sun, 27 Aug 2023 14:42:40 -0700 Subject: [PATCH 287/287] Updating the router_engine.cc --- src/accl/graph/sega/router_engine.cc | 7 ------- 1 file changed, 7 deletions(-) diff --git a/src/accl/graph/sega/router_engine.cc b/src/accl/graph/sega/router_engine.cc index a36f28b025..e26cc06645 100644 --- a/src/accl/graph/sega/router_engine.cc +++ b/src/accl/graph/sega/router_engine.cc @@ -92,13 +92,6 @@ RouterEngine::registerCenteralController( centeralController = centeral_controller; } -// void -// RouterEngine::registerCenteralController( -// CenteralController* centeral_controller) -// { -// centeralController = centeral_controller; -// } - AddrRangeList RouterEngine::GPTRespPort::getAddrRanges() const {