From f8a4188033bb814e47e9551eedad22c82d6d3697 Mon Sep 17 00:00:00 2001 From: Han Date: Wed, 29 Nov 2017 15:17:14 -0500 Subject: [PATCH 01/20] feat(baremetal NIC): initial push for baremetal NIC Only works on Intel 82599 family based NICs --- src/IOBuf.h | 3 +- src/native/GeneralPurposeAllocator.h | 20 + src/native/Ixgbe.h | 387 +++++ src/native/IxgbeDriver.cc | 2001 ++++++++++++++++++++++++++ src/native/IxgbeDriver.h | 473 ++++++ src/native/Main.cc | 15 + src/native/Msr.h | 11 + src/native/Net.cc | 5 +- src/native/Net.h | 18 +- src/native/NetIcmp.cc | 16 +- src/native/NetIp.cc | 29 +- src/native/NetTcp.cc | 22 +- src/native/NetUdp.cc | 23 +- src/native/Pci.cc | 59 +- src/native/Pci.h | 5 + src/native/config.cmake | 1 + src/native/config.h.in | 1 + 17 files changed, 3063 insertions(+), 26 deletions(-) create mode 100644 src/native/Ixgbe.h create mode 100644 src/native/IxgbeDriver.cc create mode 100644 src/native/IxgbeDriver.h diff --git a/src/IOBuf.h b/src/IOBuf.h index a405027b..c430f1a3 100644 --- a/src/IOBuf.h +++ b/src/IOBuf.h @@ -7,10 +7,10 @@ #include #include +#include #include #include #include -#include #include @@ -64,6 +64,7 @@ class IOBuf { } void TrimEnd(size_t amount) { length_ -= amount; } + void SetLength(size_t amount) { length_ = amount; } bool IsChained() const { return Next() != this; } diff --git a/src/native/GeneralPurposeAllocator.h b/src/native/GeneralPurposeAllocator.h index 12c91d97..b51869c6 100644 --- a/src/native/GeneralPurposeAllocator.h +++ b/src/native/GeneralPurposeAllocator.h @@ -16,6 +16,26 @@ namespace ebbrt { +// handler used in Pci.cc code to handle faults on multicores when mapping +// device +class MulticorePciFaultHandler : public ebbrt::VMemAllocator::PageFaultHandler { + ebbrt::Pfn vpage_; + ebbrt::Pfn ppage_; + size_t size_; + + public: + void SetMap(ebbrt::Pfn va, ebbrt::Pfn pa, size_t s) { + vpage_ = va; + ppage_ = pa; + size_ = s; + } + + void HandleFault(ebbrt::idt::ExceptionFrame* ef, + uintptr_t faulted_address) 
override { + ebbrt::vmem::MapMemory(vpage_, ppage_, size_); + } +}; + // page fault handler for mapping in physical pages // to virtual pages on all cores class LargeRegionFaultHandler : public ebbrt::VMemAllocator::PageFaultHandler { diff --git a/src/native/Ixgbe.h b/src/native/Ixgbe.h new file mode 100644 index 00000000..1a966ec1 --- /dev/null +++ b/src/native/Ixgbe.h @@ -0,0 +1,387 @@ +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ + +// from https://github.com/cisco-open-source/ethtool/ixgbe.c + +/* Register Bit Masks */ +#define IXGBE_FCTRL_SBP 0x00000002 +#define IXGBE_FCTRL_MPE 0x00000100 +#define IXGBE_FCTRL_UPE 0x00000200 +#define IXGBE_FCTRL_BAM 0x00000400 +#define IXGBE_FCTRL_PMCF 0x00001000 +#define IXGBE_FCTRL_DPF 0x00002000 +#define IXGBE_FCTRL_RPFCE 0x00004000 +#define IXGBE_FCTRL_RFCE 0x00008000 +#define IXGBE_VLNCTRL_VET 0x0000FFFF +#define IXGBE_VLNCTRL_CFI 0x10000000 +#define IXGBE_VLNCTRL_CFIEN 0x20000000 +#define IXGBE_VLNCTRL_VFE 0x40000000 +#define IXGBE_VLNCTRL_VME 0x80000000 +#define IXGBE_LINKS_UP 0x40000000 +#define IXGBE_LINKS_SPEED 0x20000000 +#define IXGBE_SRRCTL_BSIZEPKT_MASK 0x0000007F +#define IXGBE_HLREG0_TXCRCEN 0x00000001 +#define IXGBE_HLREG0_RXCRCSTRP 0x00000002 +#define IXGBE_HLREG0_JUMBOEN 0x00000004 +#define IXGBE_HLREG0_TXPADEN 0x00000400 +#define IXGBE_HLREG0_LPBK 0x00008000 +#define IXGBE_RMCS_TFCE_802_3X 0x00000008 +#define IXGBE_RMCS_TFCE_PRIORITY 0x00000010 +#define IXGBE_FCCFG_TFCE_802_3X 0x00000008 +#define IXGBE_FCCFG_TFCE_PRIORITY 0x00000010 +#define IXGBE_MFLCN_PMCF 0x00000001 /* Pass MAC Control Frames */ +#define IXGBE_MFLCN_DPF 0x00000002 /* Discard Pause Frame */ +#define IXGBE_MFLCN_RPFCE 0x00000004 /* Receive Priority FC Enable */ +#define IXGBE_MFLCN_RFCE 0x00000008 /* Receive FC Enable */ + +enum l4_type { l4_type_udp = 0, l4_type_tcp, l4_type_sctp, l4_type_rsv }; + +#define ETHHDR_LEN 14 +#define IPHDR_LEN 20 +#define UDPHDR_LEN 8 + +#define RXFLAG_IPCS (1 << 0) 
+#define RXFLAG_IPCS_VALID (1 << 1) +#define RXFLAG_L4CS (1 << 2) +#define RXFLAG_L4CS_VALID (1 << 3) + +/*********************** + * RX + * Descriptors + **********************/ +// 7.1.5 Legacy Receive Descriptor, Table 7 - 11 +typedef union { + + uint64_t raw[2]; + + struct { + uint64_t buffer_address; + + union { + uint64_t word2_raw; + + struct { + uint64_t length : 16; + uint64_t fragment_checksum : 16; + + // uint64_t status : 8; + uint64_t dd : 1; + uint64_t eop : 1; + uint64_t rsvd1 : 1; + uint64_t vp : 1; + uint64_t udpcs : 1; + uint64_t l4cs : 1; + uint64_t ipcs : 1; + uint64_t pif : 1; + + // uint64_t errors : 8; + uint64_t rxe : 1; + uint64_t rsvd2 : 1; + uint64_t rsvd3 : 1; + uint64_t rsvd4 : 1; + uint64_t rsvd5 : 1; + uint64_t rsvd6 : 1; + uint64_t tcpe : 1; + uint64_t ipe : 1; + + uint64_t vlan_tag : 16; + }; // struct + + }; // union + + } __attribute__((packed)); // struct + +} rdesc_legacy_t; // typedef union + +// 7.1.6.1 Advanced Receive Descriptors Read Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t packet_buffer; + uint64_t header_buffer; + } __attribute__((packed)); // struct +} rdesc_adv_rf_t; + +// 7.1.6.2 Advanced Receive Descriptors — Write-Back Format +typedef union { + uint64_t raw[2]; + struct { + union { + uint32_t raw32_1; + struct { + uint32_t rss_type : 4; + + // packet type + uint32_t pt_ipv4 : 1; + uint32_t pt_ipv4e : 1; + uint32_t pt_ipv6 : 1; + uint32_t pt_ipv6e : 1; + uint32_t pt_tcp : 1; + uint32_t pt_udp : 1; + uint32_t pt_sctp : 1; + uint32_t pt_nfs : 1; + uint32_t pt_isesp : 1; + uint32_t pt_isah : 1; + uint32_t pt_linksec : 1; + uint32_t pt_l2packet : 1; + uint32_t pt_rsvd : 1; + + uint32_t rsccnt : 4; + uint32_t hdr_len : 10; + uint32_t sph : 1; + }; + }; // union raw32_1 + + union { + uint32_t raw32_2; + uint32_t rss_hash; + uint32_t fragment_checksum; + uint32_t rtt; + uint32_t fcoe_param; + uint32_t flow_directors_filters_id; // may need more, page 317 + }; // union raw32_2 + + union { + uint32_t 
raw32_3; + + struct { + // extended status + uint32_t dd : 1; + uint32_t eop : 1; + uint32_t flm : 1; + uint32_t vp : 1; + + // fcstat - 2 bits + uint32_t udpcs : 1; + uint32_t l4i : 1; + + uint32_t ipcs : 1; + uint32_t pif : 1; + uint32_t rsvd_1 : 1; + uint32_t vext : 1; + uint32_t udpv : 1; + uint32_t llint : 1; + uint32_t rsvd_2 : 4; + uint32_t ts : 1; + uint32_t secp : 1; + uint32_t lb : 1; + uint32_t rsvd_3 : 1; + + // extended error + uint32_t fdierr : 3; + uint32_t hbo : 1; + uint32_t rsvd : 3; + uint32_t secerr : 2; + uint32_t rxe : 1; + uint32_t l4e : 1; + uint32_t ipe : 1; + }; // status_last_descriptor; + + struct { + // extended status + uint32_t dd2 : 1; + uint32_t eop2 : 1; + uint32_t rsvd_4 : 2; + uint32_t next_descriptor_ptr : 16; + + // extended error + uint32_t error : 12; + }; // status_non_last_descriptor; + }; // union raw32_3 + + union { + uint32_t raw32_4; + struct { + uint32_t pkt_len : 16; + uint32_t vlan_tag : 16; + }; + }; // union raw32_4 + + } __attribute__((packed)); // struct +} rdesc_adv_wb_t; + +/*********************** + * TX + * Descriptors + **********************/ +// 7.2.3.2.2 Legacy Transmit Descriptor Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t buffer_address; + + union { + uint64_t word2_raw; + + struct { + uint64_t length : 16; + uint64_t cso : 8; + + // cmd + uint64_t eop : 1; + uint64_t ifcs : 1; + uint64_t ic : 1; + uint64_t rs : 1; + uint64_t rsvd_1 : 1; + uint64_t dext : 1; + uint64_t vle : 1; + uint64_t rsvd_2 : 1; + + // sta + uint64_t dd : 1; + uint64_t rsvd_3 : 3; + + uint64_t rsvd_4 : 4; + uint64_t css : 8; + uint64_t vlan : 16; + }; + }; + + } __attribute__((packed)); +} tdesc_legacy_t; + +// 7.2.3.2.3 Advanced Transmit Context Descriptor +typedef union { + uint64_t raw[2]; + + struct { + union { + uint64_t raw_1; + + struct { + uint64_t iplen : 9; + uint64_t maclen : 7; + uint64_t vlan : 16; + uint64_t ipsec_sa_index : 10; + uint64_t fcoef : 6; + uint64_t rsvd_1 : 16; + }; + }; + + union { 
+ uint64_t raw_2; + + struct { + uint64_t ipsec_esp_len : 9; + + // tucmd + uint64_t snap : 1; + uint64_t ipv4 : 1; + uint64_t l4t : 2; // l4 packet type + uint64_t ipsec_type : 1; + uint64_t encyption : 1; + uint64_t fcoe : 1; + uint64_t rsvd_2 : 4; + + uint64_t dytp : 4; + uint64_t rsvd_3 : 5; + uint64_t dext : 1; + + uint64_t bcntlen : 6; + uint64_t idx : 1; + uint64_t rsvd_4 : 3; + uint64_t l4len : 8; + uint64_t mss : 16; + }; + }; + + } __attribute__((packed)); + +} tdesc_advance_ctxt_wb_t; + +// 7.2.3.2.4 Advanced Transmit Data Descriptor - Read Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t address; + + union { + uint64_t raw2; + struct { + uint64_t dtalen : 16; + uint64_t rsvd_1 : 2; + + // mac + uint64_t mac_ilsec : 1; + uint64_t mac_1588 : 1; + + uint64_t dtyp : 4; + + // dcmd + uint64_t eop : 1; + uint64_t ifcs : 1; + uint64_t rsvd_2 : 1; + uint64_t rs : 1; + uint64_t rsvd_3 : 1; + uint64_t dext : 1; + uint64_t vle : 1; + uint64_t tse : 1; + + // status + uint64_t dd : 1; + uint64_t rsvd_4 : 3; + + // idx + uint64_t idx : 3; + // uint64_t rsvd_5 : 2; + + uint64_t cc : 1; + + // popts + uint64_t ixsm : 1; + uint64_t txsm : 1; + uint64_t ipsec : 1; + uint64_t rsvd_6 : 3; + + uint64_t paylen : 18; + }; + }; + }; + +} tdesc_advance_tx_rf_t; + +// Advanced Transmit Data Descriptor - Write-back Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t rsvd_1; + + union { + uint64_t raw2; + + struct { + uint64_t rsvd_2 : 32; + + // status + uint64_t dd : 1; + uint64_t rsvd_3 : 3; + + uint64_t rsvd_4 : 28; + }; + }; + }; + +} tdesc_advance_tx_wbf_t; + +struct VirtioNetHeader { + static const constexpr uint8_t kNeedsCsum = 1; + static const constexpr uint8_t kGsoNone = 0; + static const constexpr uint8_t kGsoTcpv4 = 1; + static const constexpr uint8_t kGsoUdp = 3; + static const constexpr uint8_t kGsoTcpv6 = 4; + static const constexpr uint8_t kGsoEvn = 0x80; + + uint8_t flags; + uint8_t gso_type; + uint16_t hdr_len; + uint16_t 
gso_size; + uint16_t csum_start; + uint16_t csum_offset; + uint16_t num_buffers; +}; + +#endif // BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc new file mode 100644 index 00000000..ab28293b --- /dev/null +++ b/src/native/IxgbeDriver.cc @@ -0,0 +1,2001 @@ +// Copyright Boston University SESA Group 2013 - 2018. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +#include "IxgbeDriver.h" + +#include "../Align.h" +#include "../StaticIOBuf.h" +#include "../UniqueIOBuf.h" +#include "Clock.h" +#include "Debug.h" +#include "EventManager.h" +#include "Fls.h" +#include "Ixgbe.h" +#include "Net.h" +#include "Pfn.h" + +#include +#include +#include + +void ebbrt::IxgbeDriver::Create(pci::Device& dev) { + auto ixgbe_dev = new IxgbeDriver(dev); + + // physical device bringup + ixgbe_dev->Init(); + + ixgbe_dev->ebb_ = + IxgbeDriverRep::Create(ixgbe_dev, ebb_allocator->AllocateLocal()); + + // initialize per core rx and tx queues + for (size_t i = 0; i < Cpu::Count(); i++) { + ixgbe_dev->SetupMultiQueue(i); + } + + ixgbe_dev->FinishSetup(); + + // TODO remove? 
+ ebbrt::clock::SleepMilli(200); + ebbrt::kprintf("intel 82599 card initialzed\n"); +} + +const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { + return mac_addr_; +} + +void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { + ebb_->Send(std::move(buf), std::move(pinfo)); +} + +void ebbrt::IxgbeDriver::Run() { ebb_->Run(); } + +// After packet transmission, need to mark bit in +// tx queue so that it can be used again +// TX_HEAD_WB does it automatically +void ebbrt::IxgbeDriverRep::ReclaimTx() { +#ifndef TX_HEAD_WB + size_t head = ixgmq_.tx_head_; + size_t tail = ixgmq_.tx_tail_; + tdesc_advance_tx_wbf_t* actx; + + // go through all descriptors owned by HW + while (head != tail) { + actx = reinterpret_cast(&(ixgmq_.tx_ring_[head])); + + // if context + if (ixgmq_.tx_isctx_[head]) { + head = (head + 1) % ixgmq_.tx_size_; + } + // if non eop + else if (!(actx->dd)) { + head = (head + 1) % ixgmq_.tx_size_; + } + // eop + else if (actx->dd) { + head = (head + 1) % ixgmq_.tx_size_; + ixgmq_.tx_head_ = head; + } + } +#endif +} + +// every TX requires a context struct before +void ebbrt::IxgbeDriverRep::AddContext(uint8_t idx, uint8_t maclen, + uint16_t iplen, uint8_t l4len, + enum l4_type l4type) { + + tdesc_advance_ctxt_wb_t* actx; + + auto tail = ixgmq_.tx_tail_; + + // context buffer already allocated, need to zero + actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail])); + + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + + memset(actx, 0, sizeof(tdesc_advance_ctxt_wb_t)); + ixgmq_.tx_isctx_[tail] = true; + + // refer to 82599 datasheet for these settings + actx->dytp = 0b0010; + actx->dext = 1; + actx->idx = idx; + actx->maclen = maclen; + actx->iplen = iplen; + + actx->ipv4 = 1; + actx->l4len = 0; // ignored when TSE not set + actx->l4t = l4type; + + // need to increment tail + ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; +} + +// Add a new packet to be transmitted +void 
ebbrt::IxgbeDriverRep::AddTx(const uint8_t* pa, uint64_t len, + uint64_t totallen, bool first, bool last, + uint8_t ctx, bool ip_cksum, + bool tcpudp_cksum) { + tdesc_advance_tx_rf_t* actx; + + auto tail = ixgmq_.tx_tail_; + actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail])); + + ixgmq_.tx_isctx_[tail] = false; + + actx->raw[0] = 0x0; + actx->raw[1] = 0x0; + + // pa is physical address of where send buffer exists + actx->address = reinterpret_cast(pa); + actx->dtalen = len; + if (first) { + actx->paylen = totallen; + } + + // type is advanced + actx->dtyp = 0b0011; + actx->dext = 1; + + // rs bit should only be set when eop is set + if (last) { + actx->rs = 1; + } else { + actx->rs = 0; + } + + // checksum + actx->ifcs = 1; + + // set last packet bit + if (last) { + actx->eop = 1; + } else { + actx->eop = 0; + } + + // TODO enable ip checksum + if (ctx != -1) { + actx->idx = ctx; + actx->cc = 1; + actx->ixsm = ip_cksum; // no ip checksum + actx->txsm = tcpudp_cksum; // udp or tcp checksum offload + } + + ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; +} + +void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { + auto dp = buf->GetDataPointer(); + auto len = buf->ComputeChainDataLength(); + auto count = buf->CountChainElements(); + bool ip_cksum = false; + bool tcpudp_cksum = false; + + ebbrt::kbugon(len >= 0xA0 * 1000, + "%s packet len bigger than max ether length\n", __FUNCTION__); + +// TODO threshold for triggering reclaim tx buffers +#ifndef TX_HEAD_WB + size_t free_desc = + IxgbeDriver::NTXDESCS - + (std::abs(static_cast(ixgmq_.tx_tail_ - ixgmq_.tx_head_))); + // free descripts must have enough for count in chained iobufs + if (free_desc < (count + 1)) { + // reclaim buffers + ReclaimTx(); + + free_desc = IxgbeDriver::NTXDESCS - + (std::abs(static_cast(ixgmq_.tx_tail_ - ixgmq_.tx_head_))); + // not enough descriptors got freed + if (free_desc < (count + 1)) { + return; + } + } +#endif + + if 
(pinfo.flags & PacketInfo::kNeedsIpCsum) { + ip_cksum = true; + } + + // NEED CHECKSUM + if (pinfo.flags & PacketInfo::kNeedsCsum) { + tcpudp_cksum = true; + + // check datasheet for numbers + if (pinfo.csum_offset == 6) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); + } else if (pinfo.csum_offset == 16) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); + } else { + ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); + } + + // if buffer is chained + if (buf->IsChained()) { + size_t counter = 0; + for (auto& buf_it : *buf) { + counter++; + + // first buffer + if (counter == 1) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), len, + true, false, 0, ip_cksum, tcpudp_cksum); + } else { + // last buffer + if (counter == count) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, true, 0, ip_cksum, tcpudp_cksum); + } else { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, false, 0, ip_cksum, tcpudp_cksum); + } + } + } + } + // not chained + else { + AddTx(buf->Data(), len, len, true, true, 0, ip_cksum, tcpudp_cksum); + } + } else { + // NO CHECKSUM FLAG SET + // if buffer is chained + if (buf->IsChained()) { + size_t counter = 0; + for (auto& buf_it : *buf) { + counter++; + + // first buffer + if (counter == 1) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), len, + true, false, 0, ip_cksum, tcpudp_cksum); + } else { + // last buffer + if (counter == count) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, true, 0, ip_cksum, tcpudp_cksum); + } else { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, false, 0, ip_cksum, tcpudp_cksum); + } + } + } + } + // not chained + else { + AddTx(buf->Data(), len, len, true, true, 0, ip_cksum, tcpudp_cksum); + } + } + + // bump tx_tail + // indicates position beyond last descriptor hw + WriteTdt_1(Cpu::GetMine(), ixgmq_.tx_tail_); +} + +void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { + // 
Disable RXCTRL - 8.2.3.8.10 + bar0_.Write32(0x03000, m); +} + +void ebbrt::IxgbeDriver::WriteDmatxctl(uint32_t m) { + uint32_t reg; + + reg = bar0_.Read32(0x04A80); + ebbrt::kprintf("0x04A80: DMATXCTL 0x%08X - reset to 0x%08X\n", reg, reg & m); + + // DMATXCTL - 8.2.3.9.2 + bar0_.Write32(0x04A80, reg & m); +} +void ebbrt::IxgbeDriver::WriteDmatxctl_te(uint32_t m) { + auto reg = bar0_.Read32(0x04A80); + bar0_.Write32(0x04A80, reg | m); +} + +// 8.2.3.5.18 - General Purpose Interrupt Enable — GPIE (0x00898; RW) +void ebbrt::IxgbeDriver::WriteGpie(uint32_t m) { + auto reg = bar0_.Read32(0x00898); + bar0_.Write32(0x00898, reg | m); +} + +// 8.2.3.5.1 Extended Interrupt Cause Register- EICR (0x00800; RW1C) +void ebbrt::IxgbeDriver::ReadEicr() { + /* Note + * The EICR is also cleared on read if GPIE.OCD bit is cleared. When the + * GPIE.OCD bit is set, then only bits 16...29 are cleared on read. + */ + // 8.2.3.5.18 General Purpose Interrupt Enable — GPIE (0x00898;RW) + uint32_t reg; + reg = bar0_.Read32(0x00898); + ebbrt::kbugon((reg & 0x20), "GPIE.OCD not cleared\n"); + + reg = bar0_.Read32(0x00800); + ebbrt::kprintf("First Read - 0x00800: EICR 0x%08X, ", reg); + + reg = bar0_.Read32(0x00800); + ebbrt::kprintf("Second Read - EICR 0x%08X\n", reg); +} +void ebbrt::IxgbeDriver::WriteEicr(uint32_t m) { + auto reg = bar0_.Read32(0x00800); + bar0_.Write32(0x00800, reg | m); +} + +// 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) +uint32_t ebbrt::IxgbeDriver::ReadEims() { return bar0_.Read32(0x00880); } +void ebbrt::IxgbeDriver::WriteEims(uint32_t m) { bar0_.Write32(0x00880, m); } + +// 8.2.3.5.4 Extended Interrupt Mask Clear Register- EIMC (0x00888; WO) +void ebbrt::IxgbeDriver::WriteEimc(uint32_t m) { bar0_.Write32(0x00888, m); } + +// 8.2.3.5.5 Extended Interrupt Auto Clear Register — EIAC (0x00810; RW) +void ebbrt::IxgbeDriver::WriteEiac(uint32_t m) { + auto reg = bar0_.Read32(0x00810); + bar0_.Write32(0x00810, reg | m); +} + +// 8.2.3.5.8 
Extended Interrupt Mask Set/Read Registers — EIMS[n] (0x00AA0 + +// 4*(n-1), n=1...2; RWS) +void ebbrt::IxgbeDriver::WriteEimsn(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00AA0 + 4 * n); + bar0_.Write32(0x00AA0 + 4 * n, reg | m); +} + +// 8.2.3.5.12 +// Extended Interrupt Throttle Registers — EITR[n] +// (0x00820 + 4*n, n=0...23 and 0x012300 + 4*(n-24), +// n=24...128; RW) +void ebbrt::IxgbeDriver::WriteEitr(uint32_t n, uint32_t m) { + ebbrt::kbugon(n > 128, "%s error\n", __FUNCTION__); + + if (n < 24) { + bar0_.Write32(0x00820 + 4 * n, m); + } else { + bar0_.Write32(0x012300 + 4 * (n - 24), m); + } +} + +// 8.2.3.9.10 Transmit Descriptor Control — TXDCTL[n] (0x06028+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTxdctl(uint32_t n, uint32_t m) { + bar0_.Write32(0x06028 + (0x40 * n), m); +} +uint8_t ebbrt::IxgbeDriver::ReadTxdctl_enable(uint32_t n) { + auto reg = bar0_.Read32(0x06028 + 0x40 * n); + return (reg >> 25) & 0x1; +} + +// 8.2.3.8.6 Receive Descriptor Control — RXDCTL[n] (0x01028 + +// 0x40*n, n=0...63 and 0x0D028 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRxdctl_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01028 + (0x40 * n), m); +} +void ebbrt::IxgbeDriver::WriteRxdctl_1_enable(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01028 + (0x40 * n)); + bar0_.Write32(0x01028 + (0x40 * n), reg | m); +} + +uint8_t ebbrt::IxgbeDriver::ReadRxdctl_1_enable(uint32_t n) { + auto reg = bar0_.Read32(0x01028 + (0x40 * n)); + return (reg >> 25) & 0x1; +} + +void ebbrt::IxgbeDriver::WriteRxdctl_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D028 + (0x40 * n), m); +} + +// 8.2.3.27.14 PF VM L2 Control Register — PFVML2FLT[n] (0x0F000 + 4*n, +// n=0...63; RW) +void ebbrt::IxgbeDriver::WritePfvml2flt(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F000 + 4 * n, m); +} + +// 8.2.3.9.14 Manageability Transmit TC Mapping — MNGTXMAP (0x0CD10; RW) +void ebbrt::IxgbeDriver::WriteMngtxmap(uint32_t m) { + bar0_.Write32(0x0CD10, m); +} + 
+// 8.2.3.1.1 Device Control Register — CTRL (0x00000 / 0x00004;RW) +void ebbrt::IxgbeDriver::WriteCtrl(uint32_t m) { bar0_.Write32(0x0, m); } +void ebbrt::IxgbeDriver::ReadCtrl() { + uint32_t reg; + reg = bar0_.Read32(0x0); + ebbrt::kprintf("%s = 0x%X\n", __FUNCTION__, reg); +} + +// 8.2.3.1.3 Extended Device Control Register — CTRL_EXT (0x00018; RW) +void ebbrt::IxgbeDriver::WriteCtrlExt(uint32_t m) { + auto reg = bar0_.Read32(0x00018); + bar0_.Write32(0x00018, reg | m); +} + +// 8.2.3.7.1 Filter Control Register — FCTRL (0x05080; RW) +void ebbrt::IxgbeDriver::WriteFctrl(uint32_t m) { bar0_.Write32(0x05080, m); } + +// 8.2.3.24.9 Flexible Host Filter Table Registers — FHFT (0x09000 — 0x093FC and +// 0x09800 — 0x099FC; RW) +void ebbrt::IxgbeDriver::WriteFhft_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x09000, m); +} +void ebbrt::IxgbeDriver::WriteFhft_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x09800, m); +} + +// 8.2.3.1.2 Device Status Register — STATUS (0x00008; RO) +bool ebbrt::IxgbeDriver::ReadStatusPcieMes() { + auto reg = bar0_.Read32(0x8); + return !(reg & 0x80000); +} +uint8_t ebbrt::IxgbeDriver::ReadStatusLanId() { + auto reg = bar0_.Read32(0x8); + return (reg >> 2) & 0x3; +} + +// 8.2.3.3.2 Flow Control Transmit Timer Value n — FCTTVn (0x03200 + 4*n, +// n=0...3; RW) +void ebbrt::IxgbeDriver::WriteFcttv(uint32_t n, uint32_t m) { + bar0_.Write32(0x03200 + (4 * n), m); +} + +// 8.2.3.3.3 Flow Control Receive Threshold Low — FCRTL[n] (0x03220 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteFcrtl(uint32_t n, uint32_t m) { + bar0_.Write32(0x03220 + (4 * n), m); +} + +// 8.2.3.3.4 Flow Control Receive Threshold High — FCRTH[n] (0x03260 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteFcrth(uint32_t n, uint32_t m) { + bar0_.Write32(0x03260 + (4 * n), m); +} + +// 8.2.3.3.5 Flow Control Refresh Threshold Value — FCRTV (0x032A0; RW) +void ebbrt::IxgbeDriver::WriteFcrtv(uint32_t m) { bar0_.Write32(0x032A0, m); } + +// 8.2.3.3.7 Flow Control 
Configuration — FCCFG (0x03D00; RW) +void ebbrt::IxgbeDriver::WriteFccfg(uint32_t m) { bar0_.Write32(0x03D00, m); } + +// 8.2.3.2.2 EEPROM Read Register — EERD (0x10014; RW) +void ebbrt::IxgbeDriver::WriteEerd(uint32_t m) { bar0_.Write32(0x10014, m); } +bool ebbrt::IxgbeDriver::ReadEerdDone() { + auto reg = bar0_.Read32(0x10014); + return !!(reg & 0x2); // return true when Read Done = 1 +} + +uint16_t ebbrt::IxgbeDriver::ReadEerdData() { + auto reg = bar0_.Read32(0x10014); + return (reg >> 16) & 0xFFFF; +} + +uint16_t ebbrt::IxgbeDriver::ReadEeprom(uint16_t offset) { + WriteEerd(offset << 2 | 1); + // TODO: Timeout + while (ReadEerdDone() == 0) + ; + return ReadEerdData(); +} + +// 8.2.3.22.32 - Core Analog Configuration Register — CoreCTL (0x014F00; RW) +void ebbrt::IxgbeDriver::WriteCorectl(uint16_t m) { + bar0_.Write32(0x014F00, 0x0 | m); +} + +// 8.2.3.22.19 Auto Negotiation Control Register — AUTOC (0x042A0; RW) +void ebbrt::IxgbeDriver::WriteAutoc(uint32_t m) { + auto reg = bar0_.Read32(0x042A0); + bar0_.Write32(0x042A0, reg | m); +} +uint8_t ebbrt::IxgbeDriver::ReadAutocRestartAn() { + auto reg = bar0_.Read32(0x042A0); + return (reg >> 12) & 0x1; +} + +// 8.2.3.22.23 Auto Negotiation Link Partner Link Control Word 1 Register — +// ANLP1 (0x042B0; RO) +uint8_t ebbrt::IxgbeDriver::ReadAnlp1() { + auto reg = bar0_.Read32(0x042B0); + return (reg >> 16) & 0xFF; +} + +// 8.2.3.2.1 EEPROM/Flash Control Register — EEC (0x10010; RW) +uint8_t ebbrt::IxgbeDriver::ReadEecAutoRd() { + auto reg = bar0_.Read32(0x10010); + return (reg >> 9) & 0xFF; +} + +// 8.2.3.7.7 Multicast Table Array — MTA[n] (0x05200 + 4*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteMta(uint32_t n, uint32_t m) { + bar0_.Write32(0x05200 + (4 * n), m); +} + +// 8.2.3.7.11 VLAN Filter Table Array — VFTA[n] (0x0A000 + 4*n,n=0...127; RW) +void ebbrt::IxgbeDriver::WriteVfta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A000 + (4 * n), m); +} + +// 8.2.3.27.15 PF VM VLAN Pool Filter — PFVLVF[n] (0x0F100 + 
4*n, n=0...63; RW) +void ebbrt::IxgbeDriver::WritePfvlvf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F100 + 4 * n, m); +} + +// 8.2.3.27.16 PF VM VLAN Pool Filter Bitmap — PFVLVFB[n] (0x0F200 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WritePfvlvfb(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F200 + 4 * n, m); +} + +// 8.2.3.7.23 Rx Filter ECC Err Insertion 0 — RXFECCERR0 (0x051B8; RW) +void ebbrt::IxgbeDriver::WriteRxfeccerr0(uint32_t m) { + auto reg = bar0_.Read32(0x051B8); + bar0_.Write32(0x051B8, reg | m); +} + +// Checks the MAC's EEPROM to see if it supports a given SFP+ module type, if +// 1360 +// so it returns the offsets to the phy init sequence block. +// also based on +// http://lxr.free-electrons.com/source/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c?v=3.14#L1395 +// https://github.com/freebsd/freebsd/blob/386ddae58459341ec567604707805814a2128a57/sys/dev/ixgbe/ixgbe_82599.c#L173 +void ebbrt::IxgbeDriver::PhyInit() { + + uint16_t list_offset; + uint16_t data_offset = 0x0; + uint16_t data_value; + uint16_t sfp_id; + uint16_t sfp_type = 0x4; /* SPF_DA_CORE1 */ + + /* IXGBE_PHY_INIT_OFFSET_NL */ + list_offset = ReadEeprom(0x002B); + + if ((list_offset == 0x0) || (list_offset == 0xFFFF)) { + return; + } + + /* Shift offset to first ID word */ + list_offset++; + + sfp_id = ReadEeprom(list_offset); + + while (sfp_id != 0xFFFF) { + if (sfp_id == sfp_type) { + list_offset++; + data_offset = ReadEeprom(list_offset); + if ((data_offset == 0x0) || (data_offset == 0xFFFF)) { + ebbrt::kprintf("sfp init failed\n"); + return; + } else { + break; + } + } else { + list_offset += 2; + sfp_id = ReadEeprom(list_offset); + } + list_offset++; + } + + if (sfp_id == 0xFFFF) { + ebbrt::kprintf("sfp init failed\n"); + return; + } + + ebbrt::kprintf("data offset -> 0x%x\n", data_offset); + + SwfwLockPhy(); + + data_value = ReadEeprom(++data_offset); + while (data_value != 0xFFFF) { + ebbrt::kprintf("data_value -> 0x%x\n", data_value); + WriteCorectl(data_value); + 
data_value = ReadEeprom(++data_offset); + } + SwfwUnlockPhy(); + + ebbrt::clock::SleepMilli(20); + + WriteAutoc(0x0 << 13 | 0x1 << 12); + while (ReadAnlp1() != 0) + ; // TODO: timeout + + WriteAutoc(0x3 << 13 | 0x1 << 12); + while (ReadAutocRestartAn() != 0) + ; // TODO: timeout + + ebbrt::kprintf("PHY init done\n"); +} + +// 8.2.3.7.8 Receive Address Low — RAL[n] (0x0A200 + 8*n, n=0...127; RW) +uint32_t ebbrt::IxgbeDriver::ReadRal(uint32_t n) { + auto reg = bar0_.Read32(0x0A200 + 8 * n); + return reg; +} +void ebbrt::IxgbeDriver::WriteRal(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A200 + (8 * n), m); +} + +// 8.2.3.7.9 Receive Address High — RAH[n] (0x0A204 + 8*n, n=0...127; RW) +uint16_t ebbrt::IxgbeDriver::ReadRah(uint32_t n) { + auto reg = bar0_.Read32(0x0A204 + 8 * n); + return (reg)&0xFFFF; +} +uint8_t ebbrt::IxgbeDriver::ReadRahAv(uint32_t n) { + return (bar0_.Read32(0x0A204 + 8 * n) >> 31) & 0xFF; +} +void ebbrt::IxgbeDriver::WriteRah(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A204 + (8 * n), m); +} + +// 8.2.3.7.10 MAC Pool Select Array — MPSAR[n] (0x0A600 + 4*n, n=0...255; RW) +void ebbrt::IxgbeDriver::WriteMpsar(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A600 + 4 * n, m); +} + +// 8.2.3.7.19 Five tuple Queue Filter — FTQF[n] (0x0E600 + 4*n,n=0...127; RW) +void ebbrt::IxgbeDriver::WriteFtqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E600 + 4 * n, m); +} + +// 8.2.3.7.16 Source Address Queue Filter — SAQF[n] (0x0E000 + 4*n, n=0...127; +// RW) +void ebbrt::IxgbeDriver::WriteSaqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E000 + 4 * n, m); +} + +// 8.2.3.7.17 Destination Address Queue Filter — DAQF[n] (0x0E200 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDaqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E200 + 4 * n, m); +} + +// 8.2.3.7.18 Source Destination Port Queue Filter — SDPQF[n] (0x0E400 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteSdpqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E400 + 4 * n, m); +} + +// 
8.2.3.27.17 PF Unicast Table Array — PFUTA[n] (0x0F400 + 4*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WritePfuta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F400 + 4 * n, m); +} + +// 8.2.3.7.3 Multicast Control Register — MCSTCTRL (0x05090; RW) +void ebbrt::IxgbeDriver::WriteMcstctrl(uint32_t m) { + auto reg = bar0_.Read32(0x05090); + bar0_.Write32(0x05090, reg & m); +} + +// 8.2.3.10.13 DCB Transmit Descriptor Plane Queue Select — RTTDQSEL (0x04904; +// RW) +void ebbrt::IxgbeDriver::WriteRttdqsel(uint32_t m) { + auto reg = bar0_.Read32(0x04904); + bar0_.Write32(0x04904, reg | m); +} + +// 8.2.3.10.14 DCB Transmit Descriptor Plane T1 Config — RTTDT1C (0x04908; RW) +void ebbrt::IxgbeDriver::WriteRttdt1c(uint32_t m) { bar0_.Write32(0x04908, m); } + +// 8.2.3.10.16 DCB Transmit Rate-Scheduler Config — RTTBCNRC (0x04984; RW) +void ebbrt::IxgbeDriver::WriteRttbcnrc(uint32_t m) { + bar0_.Write32(0x04984, m); +} + +// 8.2.3.10.9 DCB Transmit Descriptor Plane T2 Config - RTTDT2C[n] (0x04910 + +// 4*n, n=0...7; RW) DMA-Tx +void ebbrt::IxgbeDriver::WriteRttdt2c(uint32_t n, uint32_t m) { + bar0_.Write32(0x04910 + 4 * n, m); +} + +// 8.2.3.10.10 DCB Transmit Packet Plane T2 Config — RTTPT2C[n] (0x0CD20 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteRttpt2c(uint32_t n, uint32_t m) { + bar0_.Write32(0x0CD20 + 4 * n, m); +} + +// 8.2.3.10.6 DCB Receive Packet Plane T4 Config — RTRPT4C[n] (0x02140 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteRtrpt4c(uint32_t n, uint32_t m) { + bar0_.Write32(0x02140 + 4 * n, m); +} + +// 8.2.3.10.1 DCB Receive Packet Plane Control and Status — RTRPCS (0x02430; RW) +void ebbrt::IxgbeDriver::WriteRtrpcs(uint32_t m) { bar0_.Write32(0x02430, m); } + +// 8.2.3.11.2 Tx DCA Control Registers — DCA_TXCTRL[n] (0x0600C + 0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDcaTxctrlTxdescWbro(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + bar0_.Write32(0x0600C + 0x40 * n, reg & m); +} + +// 8.2.3.11.1 Rx 
DCA Control Register — DCA_RXCTRL[n] (0x0100C + 0x40*n, +// n=0...63 and 0x0D00C + 0x40*(n-64), +// n=64...127 / 0x02200 + 4*n, [n=0...15]; RW) +void ebbrt::IxgbeDriver::WriteDcaRxctrl_1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg & m); +} + +// void ebbrt::IxgbeDriver::WriteDcaRxctrl_1_RxdataWrro(uint32_t n, uint32_t m); +void ebbrt::IxgbeDriver::WriteDcaRxctrl_2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0D00C + 0x40 * n); + bar0_.Write32(0x0D00C + 0x40 * n, reg & m); +} + +// 8.2.3.7.5 Receive Checksum Control — RXCSUM (0x05000; RW) +void ebbrt::IxgbeDriver::WriteRxcsum(uint32_t m) { + auto reg = bar0_.Read32(0x05000); + bar0_.Write32(0x05000, reg | m); +} + +// 8.2.3.8.13 RSC Control — RSCCTL[n] (0x0102C + 0x40*n, n=0...63 +// and 0x0D02C + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRscctl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0102C + 0x40 * n); + bar0_.Write32(0x0102C + 0x40 * n, reg | m); +} + +// 8.2.3.7.4 Packet Split Receive Type Register — PSRTYPE[n] +// (0x0EA00 + 4*n, n=0...63 / 0x05480 + 4*n, n=0...15; RW) +void ebbrt::IxgbeDriver::WritePsrtype(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0EA00 + 0x40 * n); + bar0_.Write32(0x0EA00 + 0x40 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WritePsrtypeZero(uint32_t n) { + bar0_.Write32(0x0EA00 + 0x40 * n, 0x0); +} + +// 8.2.3.7.15 Redirection Table — RETA[n] (0x0EB00 + 4*n, n=0...31/ 0x05C00 + +// 4*n, n=0...31; RW) +void ebbrt::IxgbeDriver::WriteReta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0EB00 + 4 * n, m); +} + +// 8.2.3.7.6 Receive Filter Control Register — RFCTL (0x05008; RW) +void ebbrt::IxgbeDriver::WriteRfctl(uint32_t m) { bar0_.Write32(0x05008, m); } + +// 8.2.3.9.16 Tx Packet Buffer Threshold — +// TXPBTHRESH (0x04950 +0x4*n, n=0...7; RW) +void ebbrt::IxgbeDriver::WriteTxpbthresh(uint32_t n, uint32_t m) { + bar0_.Write32(0x04950 + 0x4 * n, m); +} + +// 8.2.3.7.12 Multiple 
Receive Queues Command Register- MRQC (0x0EC80 / 0x05818; +// RW) +void ebbrt::IxgbeDriver::WriteMrqc(uint32_t m) { + auto reg = bar0_.Read32(0x0EC80); + bar0_.Write32(0x0EC80, reg | m); +} + +// 8.2.3.9.15 Multiple Transmit Queues Command Register — MTQC (0x08120; RW) +void ebbrt::IxgbeDriver::WriteMtqc(uint32_t m) { bar0_.Write32(0x08120, m); } + +// 8.2.3.27.1 VT Control Register — PFVTCTL (0x051B0; RW) +void ebbrt::IxgbeDriver::WritePfvtctl(uint32_t m) { bar0_.Write32(0x051B0, m); } + +// 8.2.3.10.4 DCB Receive User Priority to Traffic Class — RTRUP2TC (0x03020; +// RW) +void ebbrt::IxgbeDriver::WriteRtrup2tc(uint32_t m) { + bar0_.Write32(0x03020, m); +} + +// 8.2.3.10.5 DCB Transmit User Priority to Traffic Class — RTTUP2TC (0x0C800; +// RW) +void ebbrt::IxgbeDriver::WriteRttup2tc(uint32_t m) { + bar0_.Write32(0x0C800, m); +} + +// 8.2.3.9.1 DMA Tx TCP Max Allow Size Requests — DTXMXSZRQ (0x08100; RW) +void ebbrt::IxgbeDriver::WriteDtxmxszrq(uint32_t m) { + auto reg = bar0_.Read32(0x08100); + bar0_.Write32(0x08100, reg | m); +} + +// 8.2.3.27.9 PF PF Queue Drop Enable Register — PFQDE (0x02F04; RW) +void ebbrt::IxgbeDriver::WritePfqde(uint32_t m) { bar0_.Write32(0x02F04, m); } + +// 8.2.3.22.34 MAC Flow Control Register — MFLCN (0x04294; RW) +void ebbrt::IxgbeDriver::WriteMflcn(uint32_t m) { + auto reg = bar0_.Read32(0x04294); + bar0_.Write32(0x04294, reg | m); +} + +// 8.2.3.3.7 Flow Control Configuration — FCCFG (0x03D00; RW) +/*void ebbrt::IxgbeDriver::WriteFccfg(uint32_t m) { + auto reg = bar0_.Read32(0x03D00); + bar0_.Write32(0x03D00, reg | m); + }*/ + +// void ebbrt::IxgbeDriver::WriteDcaRxctrl_2_RxdataWrro(uint32_t n, uint32_t m); + +// 8.2.3.4.9 - Software Semaphore Register — SWSM (0x10140; RW) +bool ebbrt::IxgbeDriver::SwsmSmbiRead() { + return !!(bar0_.Read32(0x10140) & 0x1); +} +bool ebbrt::IxgbeDriver::SwsmSwesmbiRead() { + return !(bar0_.Read32(0x10140) & 0x2); +} +void ebbrt::IxgbeDriver::SwsmSwesmbiSet() { + auto reg = bar0_.Read32(0x10140); + 
ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg | 0x2); + bar0_.Write32(0x10140, reg | 0x2); +} +void ebbrt::IxgbeDriver::SwsmSmbiClear() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg & 0xFFFFFFFE); + bar0_.Write32(0x10140, reg & 0xFFFFFFFE); +} +void ebbrt::IxgbeDriver::SwsmSwesmbiClear() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg & 0xFFFFFFFD); + bar0_.Write32(0x10140, reg & 0xFFFFFFFD); +} + +// 8.2.3.22.20 Link Status Register — LINKS (0x042A4; RO) +bool ebbrt::IxgbeDriver::ReadLinksLinkUp() { + auto reg = bar0_.Read32(0x042A4); + return ((reg >> 30) & 0x1) == 1; +} + +// 8.2.3.4.11 Software-Firmware Synchronization - SW_FW_SYNC (0x10160; RW) +uint32_t ebbrt::IxgbeDriver::ReadSwfwSyncSmBits(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + return (reg & m) & 0x3FF; // masking bits 9:0 +} +void ebbrt::IxgbeDriver::WriteSwfwSyncSmBits(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + bar0_.Write32(0x10160, reg | m); +} +void ebbrt::IxgbeDriver::WriteSwfwSyncSmBits2(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + bar0_.Write32(0x10160, reg & m); +} + +// 8.2.3.11.1 Rx DCA Control Register — DCA_RXCTRL[n] (0x0100C + 0x40*n, +// n=0...63 and 0x0D00C + 0x40*(n-64), // n=0...63 and 0x0D00C + 0x40*(n-64), +// n=64...127 / 0x02200 + 4*n, [n=0...15]; RW) // n=64...127 / 0x02200 + 4*n, +// [n=0...15]; RW) +void ebbrt::IxgbeDriver::WriteDcaRxctrl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteDcaRxctrlClear(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg & m); +} + +// 8.2.3.11.4 DCA Control Register — DCA_CTRL (0x11074; RW) +void ebbrt::IxgbeDriver::WriteDcaCtrl(uint32_t m) { + auto reg = 
bar0_.Read32(0x11074); + bar0_.Write32(0x11074, reg | m); +} + +// 8.2.3.11.2 Tx DCA Control Registers — DCA_TXCTRL[n] (0x0600C + 0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDcaTxctrl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + bar0_.Write32(0x0600C + 0x40 * n, reg | m); +} + +// 8.2.3.8.1 Receive Descriptor Base Address Low — RDBAL[n] (0x01000 + 0x40*n, +// n=0...63 and 0x0D000 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdbal_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01000 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdbal_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D000 + 0x40 * n, m); +} + +// 8.2.3.8.2 Receive Descriptor Base Address High — RDBAH[n] (0x01004 + 0x40*n, +// n=0...63 and 0x0D004 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdbah_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01004 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdbah_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D004 + 0x40 * n, m); +} + +// 8.2.3.9.5 Transmit Descriptor Base Address Low — TDBAL[n] (0x06000+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdbal(uint32_t n, uint32_t m) { + bar0_.Write32(0x06000 + 0x40 * n, m); +} + +// 8.2.3.9.6 Transmit Descriptor Base Address High — TDBAH[n] (0x06004+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdbah(uint32_t n, uint32_t m) { + bar0_.Write32(0x06004 + 0x40 * n, m); +} + +// 8.2.3.9.7 Transmit Descriptor Length — TDLEN[n] (0x06008+0x40*n, n=0...127; +// RW) +void ebbrt::IxgbeDriver::WriteTdlen(uint32_t n, uint32_t m) { + bar0_.Write32(0x06008 + 0x40 * n, m); +} + +// 8.2.3.9.8 Transmit Descriptor Head — TDH[n] (0x06010+0x40*n, n=0...127; RO) +void ebbrt::IxgbeDriver::WriteTdh(uint32_t n, uint32_t m) { + bar0_.Write32(0x06010 + 0x40 * n, m); +} +uint16_t ebbrt::IxgbeDriver::ReadTdh(uint32_t n) { + auto reg = bar0_.Read32(0x06010 + 0x40 * n); + return reg & 0xFFFF; +} + +// 8.2.3.9.11 Tx Descriptor Completion Write 
Back Address Low — +// TDWBAL[n] (0x06038+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdwbal(uint32_t n, uint32_t m) { + bar0_.Write32(0x06038 + 0x40 * n, m); +} +// 8.2.3.9.12 Tx Descriptor Completion Write Back Address High — +// TDWBAH[n] (0x0603C+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdwbah(uint32_t n, uint32_t m) { + bar0_.Write32(0x0603C + 0x40 * n, m); +} + +// 8.2.3.9.9 Transmit Descriptor Tail — TDT[n] (0x06018+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdt(uint32_t n, uint32_t m) { + bar0_.Write32(0x06018 + 0x40 * n, m); +} + +// 8.2.3.8.3 Receive Descriptor Length — RDLEN[n] (0x01008 + 0x40*n, n=0...63 +// and 0x0D008 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdlen_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01008 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdlen_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D008 + 0x40 * n, m); +} + +// 8.2.3.8.7 Split Receive Control Registers — SRRCTL[n] (0x01014 + 0x40*n, +// n=0...63 and 0x0D014 + 0x40*(n-64), n=64...127 / 0x02100 + 4*n, [n=0...15]; +// RW) +void ebbrt::IxgbeDriver::WriteSrrctl_1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01014 + 0x40 * n); + bar0_.Write32(0x01014 + 0x40 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteSrrctlZero(uint32_t n) { + bar0_.Write32(0x01014 + 0x40 * n, 0x0); +} + +// 8.2.3.8.12 RSC Data Buffer Control Register — RSCDBU (0x03028; RW) +void ebbrt::IxgbeDriver::WriteRscdbu(uint32_t m) { + auto reg = bar0_.Read32(0x03028); + bar0_.Write32(0x03028, reg | m); +} + +void ebbrt::IxgbeDriver::WriteSrrctl_1_desctype(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01014 + 0x40 * n); + bar0_.Write32(0x01014 + 0x40 * n, reg & m); +} + +// 8.2.3.8.8 Receive DMA Control Register — RDRXCTL (0x02F00; RW) +void ebbrt::IxgbeDriver::WriteRdrxctl(uint32_t m) { + auto reg = bar0_.Read32(0x02F00); + bar0_.Write32(0x02F00, reg | m); +} + +void ebbrt::IxgbeDriver::WriteRdrxctlRSCFRSTSIZE(uint32_t m) { + auto reg = 
bar0_.Read32(0x02F00); + bar0_.Write32(0x02F00, reg & m); +} + +uint8_t ebbrt::IxgbeDriver::ReadRdrxctlDmaidone() { + auto reg = bar0_.Read32(0x02F00); + return (reg >> 3) & 0x1; +} + +// 8.2.3.8.9 Receive Packet Buffer Size — RXPBSIZE[n] (0x03C00 + 4*n, n=0...7; +// RW) +void ebbrt::IxgbeDriver::WriteRxpbsize(uint32_t n, uint32_t m) { + bar0_.Write32(0x03C00 + 4 * n, m); +} + +// 8.2.3.9.13 Transmit Packet Buffer Size — TXPBSIZE[n] (0x0CC00 + 0x4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteTxpbsize(uint32_t n, uint32_t m) { + bar0_.Write32(0x0CC00 + 0x4 * n, m); +} + +// 8.2.3.9.16 Tx Packet Buffer Threshold — TXPBTHRESH (0x04950+0x4*n, n=0...7; +// RW) +void ebbrt::IxgbeDriver::WriteTxpbThresh(uint32_t n, uint32_t m) { + bar0_.Write32(0x04950 + 0x4 * n, m); +} + +// 8.2.3.22.8 MAC Core Control 0 Register — HLREG0 (0x04240; RW) +void ebbrt::IxgbeDriver::WriteHlreg0(uint32_t m) { + auto reg = bar0_.Read32(0x04240); + bar0_.Write32(0x04240, reg | m); +} + +// 8.2.3.8.5 Receive Descriptor Tail — RDT[n] (0x01018 + 0x40*n, n=0...63 and +// 0x0D018 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdt_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01018 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdt_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D018 + 0x40 * n, m); +} + +// 8.2.3.8.4 Receive Descriptor Head — RDH[n] (0x01010 + 0x40*n, n=0...63 and +// 0x0D010 + 0x40*(n-64), n=64...127; RO) +void ebbrt::IxgbeDriver::WriteRdh_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01010 + 0x40 * n, m); +} +void ebbrt::IxgbeDriverRep::WriteRdh_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x01010 + 0x40 * n, m); +} + +uint16_t ebbrt::IxgbeDriver::ReadRdh_1(uint32_t n) { + auto reg = bar0_.Read32(0x01010 + 0x40 * n); + return reg & 0xFFFF; +} + +uint16_t ebbrt::IxgbeDriver::ReadRdt_1(uint32_t n) { + auto reg = bar0_.Read32(0x01018 + 0x40 * n); + return reg & 0xFFFF; +} + +void ebbrt::IxgbeDriver::SwfwSemRelease() { + SwsmSwesmbiClear(); + 
SwsmSmbiClear(); + ebbrt::kprintf("%s\n", __FUNCTION__); +} + +// 8.2.3.5.16 Interrupt Vector Allocation Registers — IVAR[n] (0x00900 + 4*n, +// n=0...63; RW) +void ebbrt::IxgbeDriver::WriteIvarAlloc0(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval0(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc3(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval3(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +// 8.2.3.10.2 DCB Transmit Descriptor Plane Control and Status — RTTDCS +// (0x04900; RW) DMA-Tx +void ebbrt::IxgbeDriver::WriteRttdcs(uint32_t m) { + auto reg = bar0_.Read32(0x04900); + bar0_.Write32(0x04900, reg | m); +} +void ebbrt::IxgbeDriver::WriteRttdcsArbdisEn(uint32_t m) { + auto reg = bar0_.Read32(0x04900); + bar0_.Write32(0x04900, reg & m); +} + +// 8.2.3.10.3 DCB Transmit Packet Plane Control and Status- RTTPCS (0x0CD00; RW) +void ebbrt::IxgbeDriver::WriteRttpcs(uint32_t m) { bar0_.Write32(0x0CD00, m); } + +// 8.2.3.12.5 Security Rx 
Control — SECRXCTRL (0x08D00; RW) +void ebbrt::IxgbeDriver::WriteSecrxctrl_Rx_Dis(uint32_t m) { + auto reg = bar0_.Read32(0x08D00); + if (m) { + bar0_.Write32(0x08D00, reg | m); + } else { + bar0_.Write32(0x08D00, reg & ~(0x1 << 1)); + } +} + +// 8.2.3.12.6 Security Rx Status — SECRXSTAT (0x08D04; RO) +uint8_t ebbrt::IxgbeDriver::ReadSecrxstat_Sr_Rdy() { + auto reg = bar0_.Read32(0x08D04); + return reg & 0x1; +} + +// 8.2.3.23.59 Total Packets Received — TPR (0x040D0; RC) +uint32_t ebbrt::IxgbeDriver::ReadTpr() { + auto reg = bar0_.Read32(0x040D0); + ebbrt::kprintf("%s %d\n", __FUNCTION__, reg); + return reg; +} + +// 8.2.3.23.26 Good Packets Received Count — GPRC (0x04074; RO) +uint32_t ebbrt::IxgbeDriver::ReadGprc() { + auto reg = bar0_.Read32(0x04074); + ebbrt::kprintf("%s %d\n", __FUNCTION__, reg); + return reg; +} + +bool ebbrt::IxgbeDriver::SwfwSemAcquire() { + // polls SWSM.SMBI until 0b is read or timeout + // TODO: timeout after 10 ms + while (SwsmSmbiRead()) + ; + + // writes 1b to SWSM.SWESMBI bit + SwsmSwesmbiSet(); + + // polls SWSM.SWESMBI bit until read as 1b + // TODO: timeout of 3 secs + while (SwsmSwesmbiRead()) + ; + + return true; +} + +// 10.5.4 Software and Firmware Synchronization +bool ebbrt::IxgbeDriver::SwfwLockPhy() { + bool good = false; + +again: + if (!SwfwSemAcquire()) { + ebbrt::kabort("SwfwSemAcquire failed\n"); + } else { + ebbrt::kprintf("SWSM Sem acquired\n"); + } + + if ((ReadStatusLanId() == 0) && (ReadSwfwSyncSmBits(0x2) == 0) // SW_PHY_SM0 + && (ReadSwfwSyncSmBits(0x40) == 0)) // FW_PHY_SM0 + { + WriteSwfwSyncSmBits(0x2); // SW_PHY_SM0 + ebbrt::kprintf("SW_PHY_SMO written\n"); + good = true; + } else if ((ReadSwfwSyncSmBits(0x4) == 0) // SW_PHY_SM1 + && (ReadSwfwSyncSmBits(0x80) == 0)) // FW_PHY_SM1 + { + WriteSwfwSyncSmBits(0x4); // SW_PHY_SM1 + ebbrt::kprintf("SW_PHY_SM1 written\n"); + good = true; + } + + SwfwSemRelease(); + + if (!good) { + ebbrt::kprintf("%s: failed, trying again\n", __FUNCTION__); + 
ebbrt::clock::SleepMilli(20); + goto again; + } + + return true; +} +void ebbrt::IxgbeDriver::SwfwUnlockPhy() { + if (!SwfwSemAcquire()) { + ebbrt::kabort("SwfwSemAcquire failed\n"); + } else { + ebbrt::kprintf("SWSM Sem acquired\n"); + } + + if (ReadStatusLanId() == 0) { + WriteSwfwSyncSmBits2(~0x2); // SW_PHY_SM0 + } else { + WriteSwfwSyncSmBits2(~0x4); // SW_PHY_SM1 + } + + SwfwSemRelease(); + + ebbrt::clock::SleepMilli(10); +} + +void ebbrt::IxgbeDriver::StopDevice() { + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + + // disable rx + WriteRxctrl(0x0); + + // disable tx + WriteDmatxctl(0xFFFFFFFE); + + // disable interrupts + WriteEimc(0x7FFFFFFF); + ReadEicr(); + + // disable each rx and tx queue + for (auto i = 0; i < 128; i++) { + // Bit 26, transmit software flush + WriteTxdctl(i, 0x04000000); + + if (i < 64) { + WriteRxdctl_1(i, 0x0); + } else { + WriteRxdctl_2(i - 64, 0x0); + } + } + + // from arrakis + ebbrt::clock::SleepMilli(2); + + // Master disable procedure + WriteCtrl(0x4); // PCIe Master Disable + while (ReadStatusPcieMes() != 1) + ; + ebbrt::kprintf("Ixgbe 82599 stop done\n"); +} + +void ebbrt::IxgbeDriver::GlobalReset() { + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + + WriteCtrl(0x8); // Link Reset + WriteCtrl(0x4000000); // Device Reset + + // Note: To ensure that a global device reset has fully completed and that the + // 82599 responds to subsequent accesses, programmers must wait + // before approximately 1 ms after setting attempting to check + // if the bit has cleared or to access (read or write) any other device + // register. 
+ ebbrt::clock::SleepMilli(2); + ReadCtrl(); +} + +/** + * ixgbe_init_hw_generic - Generic hardware initialization + * @hw: pointer to hardware structure + * + * Initialize the hardware by resetting the hardware, filling the bus info + * structure and media type, clears all on chip counters, initializes receive + * address registers, multicast table, VLAN filter table, calls routine to set + * up link and flow control settings, and leaves transmit and receive units + * disabled and uninitialized + **/ +void ebbrt::IxgbeDriver::Init() { + uint64_t d_mac; + + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + bar0_.Map(); // allocate virtual memory + ebbrt::clock::SleepMilli(200); + ebbrt::kprintf("Sleep 200 ms\n"); + + StopDevice(); + GlobalReset(); + ebbrt::clock::SleepMilli(50); + GlobalReset(); + ebbrt::clock::SleepMilli(250); + + // disable interrupts + WriteEimc(0x7FFFFFFF); + ReadEicr(); + + // Let firmware know we have taken over + WriteCtrlExt(0x1 << 28); // DRV_LOAD + + // No snoop disable from FreeBSD ?? 
+ WriteCtrlExt(0x1 << 16); // NS_DIS + + // Initialize flow-control registers + for (auto i = 0; i < 8; i++) { + if (i < 4) { + WriteFcttv(i, 0x0); + } + WriteFcrtl(i, 0x0); + WriteFcrth(i, 0x0); + } + + WriteFcrtv(0x0); + WriteFccfg(0x0); + + // Initialize Phy + PhyInit(); + + // Wait for EEPROM auto read + while (ReadEecAutoRd() == 0) { + }; // TODO: Timeout + ebbrt::kprintf("EEPROM auto read done\n"); + + ebbrt::clock::SleepMilli(200); + d_mac = ReadRal(0) | ((uint64_t)ReadRah(0) << 32); + // ebbrt::kprintf("mac %p valid = %x\n", d_mac, ReadRahAv(0)); + for (auto i = 0; i < 6; i++) { + mac_addr_[i] = (d_mac >> (i * 8)) & 0xFF; + } + ebbrt::kprintf( + "Mac Address: %02X:%02X:%02X:%02X:%02X:%02X\n", + static_cast(mac_addr_[0]), static_cast(mac_addr_[1]), + static_cast(mac_addr_[2]), static_cast(mac_addr_[3]), + static_cast(mac_addr_[4]), static_cast(mac_addr_[5])); + + // Wait for DMA initialization + while (ReadRdrxctlDmaidone() == 0) { + }; // TODO: Timeout + + // Wait for link to come up + while (!ReadLinksLinkUp()) { + }; // TODO: timeout + ebbrt::kprintf("Link is up\n"); + ebbrt::clock::SleepMilli(50); + + // clears on read + WriteEicr(0xFFFFFFFF); + + /* setup msix */ + // switch to msix mode + WriteGpie(0x1 << 4); // Multiple_MSIX + WriteGpie(0x1 << 31); // PBA_support + WriteGpie(0x1 << 5); // OCD + + // TODO: Set up management interrupt handler + + // Enable auto masking of interrupt + WriteGpie(0x1 << 30); // EIAME + +#ifdef RSC_EN + // TODO: RSC delay value, just a guess at (1 + 1) * 4us = 8 us + // Recommended value based on 7.3.2.1.1 + WriteGpie(0x1 << 11); +#endif + + /* FreeBSD: + * ixgbe_common.c - s32 ixgbe_init_rx_addrs_generic(struct ixgbe_hw *hw) + * Places the MAC address in receive address register 0 and clears the rest + * of the receive address registers. Clears the multicast table. Assumes + * the receiver is in reset when the routine is called. + */ + // Initialize RX filters + + /* Zero out the other receive addresses. 
*/ + for (auto i = 1; i < 128; i++) { + WriteRal(i, 0x0); + WriteRah(i, 0x0); + } + + // clear mta + for (auto i = 0; i < 128; i++) { + WriteMta(i, 0x0); + } + + // No init uta tables? + + // set vlan filter table + for (auto i = 0; i < 128; i++) { + WriteVfta(i, 0x0); + } + + for (auto i = 0; i < 64; i++) { + // WritePfvlvf(i, 0x1 << 31); // VI_En bit 31 + WritePfvlvf(i, 0x0); + WritePfvlvfb(i, 0x0); + // WritePsrtypeZero(0x0); + } + + // PF Unicast Table Array + for (auto i = 0; i < 128; i++) { + WritePfuta(i, 0x0); + } + + // not sure why initing these tables? + for (auto i = 0; i < 128; i++) { + WriteFhft_1(i, 0x0); + if (i < 64) { + WriteFhft_2(i, 0x0); + } + } + + // enable ECC Reporting TODO - causes interrupts to be broken?? + // WriteRxfeccerr0(0x1 << 9); + + /**** Initialize RX filters ****/ + // FreeBSD if_ix.c - ixgbe_initialize_receive_units - Enable broadcast accept + WriteFctrl(0x1 << 10); // Set BAM = 1 + + // TODO VLNCTRL + WriteMcstctrl(0x0); + +#ifndef RSC_EN + WriteRxcsum(0x1 << 12); // IP payload checksum enable +#endif +// TODO RQTC + +#ifdef RSC_EN + WriteRfctl(0x0); +#else + WriteRfctl(0x1 << 5); +#endif + + for (auto i = 0; i < 256; i++) { + WriteMpsar(i, 0x0); + } + + // TODO RSSRK + + for (auto i = 0; i < 32; i++) { + WriteReta(i, 0x0); + } + + for (auto i = 0; i < 128; i++) { + WriteFtqf(i, 0x0); + WriteSaqf(i, 0x0); + WriteDaqf(i, 0x0); + WriteSdpqf(i, 0x0); + } + + // TODO SYNQF + // TODO ETQF + // TODO ETQS + + // Make sure RX CRC strip enabled in HLREG0 and RDRXCTL + WriteRdrxctlRSCFRSTSIZE(~(0x1F << 17)); // s/w set to 0 + WriteRdrxctl(0x1 << 1); // CRCStrip + WriteHlreg0(0x1 << 1); // CRCStrip + WriteRdrxctl(0x1 << 25); // RSCACKC s/w set to 1 + WriteRdrxctl(0x1 << 26); // FCOE_WRFIX s/w set to 1 + // TODO RSCDBU + + /***** END RX FILTER *****/ + + // Configure buffers etc. according to specification + // Section 4.6.11.3.4 (no DCB, no virtualization) + + /* Transmit Init: Set RTTDCS.ARBDIS to 1b. 
+ * Program DTXMXSZRQ, TXPBSIZE, TXPBTHRESH, MTQC, and MNGTXMAP, according + * to the DCB and virtualization modes (see Section 4.6.11.3). + * Clear RTTDCS.ARBDIS to 0b. + */ + WriteRttdcs(0x1 << 6); + WriteDtxmxszrq(0xFFF); + WriteTxpbsize(0, 0xA0 << 10); + WriteTxpbThresh(0, 0xA0); + for (auto i = 1; i < 8; i++) { + WriteTxpbsize(i, 0x0); + WriteTxpbThresh(i, 0x0); + } + WriteMtqc(0x0); + WriteMngtxmap(0x0); + WriteRttdcsArbdisEn(~(0x1 << 6)); + + /* Receive Init: Program RXPBSIZE, MRQC, PFQDE, RTRUP2TC, MFLCN.RPFCE, + * and MFLCN.RFCE according to the DCB and virtualization modes + */ + WriteRxpbsize(0, 0x200 << 10); + for (auto i = 1; i < 8; i++) { + WriteRxpbsize(i, 0x0); + } + WriteMrqc(0x0); + WritePfqde(0x0); + WriteRtrup2tc(0x0); + WriteMflcn(0x0 << 2); + WriteMflcn(0x1 << 3); + // end DCB off, VT off + + // TODO Enable Jumbo Packets + + // disable relaxed ordering + for (auto i = 0; i < 128; i++) { + WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); // Txdesc_Wbro + + if (i < 64) { + WriteDcaRxctrl_1( + i, ~(0x1 << 15)); // Rx split header relax order enable, bit 15 + WriteDcaRxctrl_1( + i, ~(0x1 << 13)); // Rx data Write Relax Order Enable, bit 13 + } else { + WriteDcaRxctrl_2( + i - 64, ~(0x1 << 15)); // Rx split header relax order enable, bit 15 + WriteDcaRxctrl_2( + i - 64, ~(0x1 << 13)); // Rx data Write Relax Order Enable, bit 13 + } + } + +#ifdef DCA_ENABLE + // DCA_MODE = DCA 1.0 + WriteDcaCtrl(0x1 << 1); +#endif +} + +void ebbrt::IxgbeDriver::FinishSetup() { + // No snoop disable from FreeBSD ?? 
+ WriteCtrlExt(0x1 << 16); // NS_DIS + for (size_t i = 0; i < Cpu::Count(); i++) { + WriteDcaRxctrlClear(i, ~(0x1 << 12)); // clear bit 12 + } + WriteEims(0xFFFF); +} + +// initializes per core rx/tx queues and interrupts +void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { + if (!rcv_vector) { + rcv_vector = + event_manager->AllocateVector([this]() { ebb_->ReceivePoll(); }); + } + + // allocate memory for descriptor rings + ixgmq[i].reset(new e10Kq(i, Cpu::GetMyNode())); + + // not going to set up receive descripts greater than 63 + ebbrt::kbugon(i >= 64, "can't set up descriptors greater than 63\n"); + + // update register RDBAL, RDBAH with receive descriptor base address + WriteRdbal_1(i, ixgmq[i]->rxaddr_ & 0xFFFFFFFF); + WriteRdbah_1(i, (ixgmq[i]->rxaddr_ >> 32) & 0xFFFFFFFF); + + // set to number of bytes allocated for receive descriptor ring + WriteRdlen_1(i, ixgmq[i]->rx_size_bytes_); + + // program srrctl register + WriteSrrctlZero(i); + WriteSrrctl_1(i, RXBUFSZ / 1024); // bsizepacket + WriteSrrctl_1(i, (128 / 64) << 8); // bsizeheader + +// TODO headsplit adv +#ifdef RSC_EN + WriteSrrctl_1(i, 0x1 << 25); // desctype adv +#else + // legacy is default?? + WriteSrrctl_1(i, ~(0x7 << 25)); // desctype legacy +#endif + + WriteSrrctl_1(i, 0x1 << 28); // Drop_En + +#ifdef RSC_EN + // RSC set up + WriteRscctl(i, 0x3 << 2); // MAXDESC + WriteRscctl(i, 0x1); // RSCEN + WritePsrtypeZero(i); + WritePsrtype(i, 0x1 << 4); // Split received TCP packets after TCP header. +#endif + + // Set head and tail pointers + WriteRdt_1(i, 0x0); + WriteRdh_1(i, 0x0); + + // Set Enable bit in receive queue + WriteRxdctl_1_enable(i, 0x1 << 25); + // TODO: Timeout + while (ReadRxdctl_1_enable(i) == 0) + ; + + // setup RX interrupts for queue i + dev_.SetMsixEntry(i, rcv_vector, ebbrt::Cpu::GetByIndex(i)->apic_id()); + + // don't set up interrupts for tx since we have head writeback?? 
+ auto qn = i / 2; // put into correct IVAR + + if ((i % 2) == 0) { // check if 2xN or 2xN + 1 + WriteIvarAlloc0(qn, i); // rx interrupt allocation corresponds to index i * + // 2 in MSI-X table + WriteIvarAllocval0(qn, 0x1 << 7); + } else { + WriteIvarAlloc2(qn, i << 16); + WriteIvarAllocval2(qn, 0x1 << 23); + } + + // must be greater than rsc delay + // WriteEitr(i, 0x80 << 3); // 7 * 2us = 14 us + WriteEitr(i, 0x7 << 3); // 16 * 2us = 32 us + + // 7.3.1.4 - Note that there are no EIAC(1)...EIAC(2) registers. + // The hardware setting for interrupts 16...63 is always auto clear. + if (i < 16) { + // enable auto clear + WriteEiac(0x1 << i); + } + + // enable interrupt + WriteEimsn(i / 32, (0x1 << (i % 32))); + + // make sure interupt is cleared + if (i < 16) { + WriteEicr(0x1 << i); + } + + // Enable RX + // disable RX_DIS + WriteSecrxctrl_Rx_Dis(0x1 << 1); + // TODO Timeout + while (ReadSecrxstat_Sr_Rdy() == 0) + ; + WriteRxctrl(0x1); + // enable RX_DIS + WriteSecrxctrl_Rx_Dis(0x0 << 1); + + // add buffer to each descriptor + for (size_t j = 0; j < NRXDESCS - 1; j++) { + auto rxphys = + reinterpret_cast((ixgmq[i]->circ_buffer_[j])->MutData()); + auto tail = ixgmq[i]->rx_tail_; + +// update buffer address for descriptor +#ifdef RSC_EN + rdesc_adv_rf_t* tmp; + tmp = reinterpret_cast(&(ixgmq[i]->rx_ring_[tail])); + + tmp->packet_buffer = rxphys; + // TODO only use this if enabling header splitting? 
+ tmp->header_buffer = 0; +#else + ixgmq[i]->rx_ring_[tail].buffer_address = rxphys; +#endif + + ixgmq[i]->rx_tail_ = (tail + 1) % ixgmq[i]->rx_size_; + } + + // bump tail pts via register rdt to enable descriptor fetching by setting to + // length of ring minus one + WriteRdt_1(i, ixgmq[i]->rx_tail_); + +#ifdef DCA_ENABLE + auto myapic = ebbrt::Cpu::GetByIndex(i)->apic_id(); + + WriteDcaRxctrl(i, 0x1 << 5); // Descriptor DCA EN + WriteDcaRxctrl(i, 0x1 << 6); // Rx Header DCA EN + WriteDcaRxctrl(i, 0x1 << 7); // Payload DCA EN + + WriteDcaRxctrl(i, myapic << 24); // CPUID = apic id + + WriteDcaTxctrl(i, 0x1 << 5); // DCA Enable + WriteDcaTxctrl(i, myapic << 24); // CPUID = apic id +#endif + + // program base address registers + WriteTdbal(i, ixgmq[i]->txaddr_ & 0xFFFFFFFF); + WriteTdbah(i, (ixgmq[i]->txaddr_ >> 32) & 0xFFFFFFFF); + + // length must also be 128 byte aligned + WriteTdlen(i, ixgmq[i]->tx_size_bytes_); + +#ifdef TX_HEAD_WB + WriteTdwbal(i, (ixgmq[i]->txhwbaddr_ & 0xFFFFFFFF) | 0x1); + WriteTdwbah(i, (ixgmq[i]->txhwbaddr_ >> 32) & 0xFFFFFFFF); +#endif + + // enable transmit path + WriteDmatxctl_te(0x1); + + // transmit queue enable + WriteTxdctl(i, 0x1 << 25); + + // poll until set, TODO: Timeout + while (ReadTxdctl_enable(i) == 0) + ; + + // TODO: set up dca txctrl FreeBSD? 
+ // clear TXdescWBROen + WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); +} + +// after packet received, need to make sure device can reuse +void ebbrt::IxgbeDriverRep::ReclaimRx() { + for (size_t i = 0; i < ixgmq_.rsc_chain_.size(); i++) { + // bump tail ptr + ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + auto n = ixgmq_.rsc_chain_[i].first; + + // reset buffer + ixgmq_.rx_ring_[n].raw[0] = 0; + ixgmq_.rx_ring_[n].raw[1] = 0; + // allocate new rx buffer + ixgmq_.circ_buffer_[n] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[n])->MutData()); + // update buffer with new adder + ixgmq_.rx_ring_[n].buffer_address = rxphys; + } +} + +// keep check for new packets to receive +// may wait for RSC to be done +uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, + uint64_t* rxflag, bool* process_rsc, + uint32_t* rnt) { +#ifdef RSC_EN + rdesc_adv_wb_t* tmp; + tmp = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); + + // if rx packet not ready + if (!(tmp->dd)) { + return 1; + } + + auto rsccnt = tmp->rsccnt; + + // not RSC, handled normally + if (rsccnt == 0 && tmp->eop) { + *len = tmp->pkt_len; + + /* set rx flags */ + // TCP/UDP checksum + if (tmp->l4i) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp->l4e)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp->ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp->ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + // reset descriptor + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 0; + } + // not sure what case this is, no context started, eop is set but rsccnt > 0 + else if (rsccnt > 0 && tmp->eop && !(ixgmq_.rsc_used)) { + kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, + "RSC: NEXTP > RX_SIZE\n"); + + *len = tmp->pkt_len; + + /* set rx flags */ + // TCP/UDP checksum + if 
(tmp->l4i) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp->l4e)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp->ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp->ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + // reset descriptor + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 0; + } + // START NEW RSC CONTEXT + else if (rsccnt > 0 && !(tmp->eop) && !(ixgmq_.rsc_used)) { + kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, + "RSC: NEXTP > RX_SIZE\n"); + + ixgmq_.rsc_used = true; + ixgmq_.rsc_chain_.clear(); + ixgmq_.rsc_chain_.emplace_back( + std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 1; + } + // APPEND TO EXISTING RSC CONTEXT + else if (rsccnt > 0 && !(tmp->eop) && ixgmq_.rsc_used) { + kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, + "RSC: NEXTP > RX_SIZE\n"); + + ixgmq_.rsc_chain_.emplace_back( + std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 1; + } + // LAST RSC CONTEXT + else if (rsccnt > 0 && tmp->eop && ixgmq_.rsc_used) { + ixgmq_.rsc_used = false; + + /* set rx flags */ + // TCP/UDP checksum + if (tmp->l4i) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp->l4e)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp->ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp->ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + ixgmq_.rsc_chain_.emplace_back( + std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + *process_rsc = true; + + return 0; + } else { + // shouldn't hit here + ebbrt::kabort("%s Not sure what state\n", __FUNCTION__); + } + +#else + // no RSC so just get one packet at a time + 
int c = static_cast(Cpu::GetMine()); + rdesc_legacy_t tmp; + tmp = ixgmq_.rx_ring_[ixgmq_.rx_head_]; + + if (tmp.dd && tmp.eop) { + *len = tmp.length; + + /* set rx flags */ + // TCP/UDP checksum + if (tmp.l4cs) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp.tcpe)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp.ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp.ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + // reset descriptor + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 0; + } +#endif + + return 1; +} + +void ebbrt::IxgbeDriverRep::ReceivePoll() { + uint32_t len; + uint64_t bAddr; + uint64_t rxflag; + bool process_rsc; + uint32_t count; + uint32_t rnt; + static bool ret = false; + process_rsc = false; + +retry: + rxflag = 0; + count = 0; + rnt = 0; + + // get address of buffer with data + while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt) == 0) { + // hit last rsc context, start to process all buffers + if (process_rsc) { + ret = true; + process_rsc = false; + count++; + + auto n = ixgmq_.rsc_chain_[0].first; + auto rsclen = 0; + + // TODO hack - need to set actual length of data else there'll be 0's + // attached + ixgmq_.circ_buffer_[n]->SetLength(ixgmq_.rsc_chain_[0].second); + + rsclen += ixgmq_.rsc_chain_[0].second; + + // TODO - maybe find better way to rewrite this + auto b = std::move(ixgmq_.circ_buffer_[n]); + + for (size_t x = 1; x < ixgmq_.rsc_chain_.size(); x++) { + count++; + + auto n = ixgmq_.rsc_chain_[x].first; + // TODO hack - need to set actual length of data + ixgmq_.circ_buffer_[n]->SetLength(ixgmq_.rsc_chain_[x].second); + rsclen += ixgmq_.rsc_chain_[x].second; + b->PrependChain(std::move(ixgmq_.circ_buffer_[n])); + } + + ReclaimRx(); + + root_.itf_.Receive(std::move(b), rxflag); + } else { + // done with buffer addr above, now to reuse it + auto tail = ixgmq_.rx_tail_; + + // 
bump tail ptr + ixgmq_.rx_tail_ = (tail + 1) % ixgmq_.rx_size_; + + count++; + + if (count > 0) { + auto tail = ixgmq_.rx_tail_; + + // TODO hack - need to set actual length of data otherwise it'll send + // leftover 0's + ixgmq_.circ_buffer_[tail]->SetLength(len); + + // TODO hack - need to reallocate IOBuf after its been moved to Receive + auto b = std::move(ixgmq_.circ_buffer_[tail]); + + ixgmq_.circ_buffer_[tail] = + std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[tail])->MutData()); + + ixgmq_.rx_ring_[tail].buffer_address = rxphys; + + root_.itf_.Receive(std::move(b), rxflag); + } + } + } + + // TODO: Update tail register here or above? + if (count > 0) { + // update reg + WriteRdt_1(Cpu::GetMine(), ixgmq_.rx_tail_); + } + + // keep looping back once we see start of rsc context + if (likely(ret)) { + goto retry; + } +} + +ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) + : root_(root), ixgq_(root_.GetQueue()), + ixgmq_(root.GetMultiQueue(Cpu::GetMine())), + receive_callback_([this]() { ReceivePoll(); }) { + this->ReceivePoll(); +} + +uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01010 + 0x40 * n); + return reg & 0xFFFF; +} +uint16_t ebbrt::IxgbeDriverRep::ReadRdt_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01018 + 0x40 * n); + return reg & 0xFFFF; +} + +void ebbrt::IxgbeDriverRep::WriteRdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x01018 + 0x40 * n, m); +} + +void ebbrt::IxgbeDriverRep::Run() { + while (1) { + ReceivePoll(); + } +} +void ebbrt::IxgbeDriverRep::WriteTdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x06018 + 0x40 * n, m); +} + +// 8.2.3.5.9 Extended Interrupt Mask Clear Registers — EIMC[n] +// (0x00AB0 + 4*(n-1), n=1...2; WO) +// Register is write-only (WO): write m directly, no read-modify-write. +void ebbrt::IxgbeDriverRep::WriteEimcn(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x00AB0 + 4 * n, m); +} diff --git
a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h new file mode 100644 index 00000000..46670a2d --- /dev/null +++ b/src/native/IxgbeDriver.h @@ -0,0 +1,473 @@ +// Copyright Boston University SESA Group 2013 - 2017. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ + +#include "../Align.h" +#include "../MulticoreEbb.h" +#include "../SpinLock.h" +#include "../StaticIOBuf.h" +#include "../UniqueIOBuf.h" +#include "Debug.h" +#include "Fls.h" +#include "Ixgbe.h" +#include "Net.h" +#include "PageAllocator.h" +#include "Pci.h" +#include "Pfn.h" +#include "SlabAllocator.h" + +// Receive Side Coalescing (RSC) enabled +#define RSC_EN +// Direct Cache Access (DCA) enabled +#define DCA_ENABLE +// Transmit descriptor head write-back enabled +#define TX_HEAD_WB + +namespace ebbrt { + +// Per-core receive and transmit queue +typedef struct { + rdesc_legacy_t* rx_ring; + size_t rx_head; + size_t rx_tail; + size_t rx_size; + + tdesc_legacy_t* tx_ring; + uint32_t* tx_head; + size_t tx_tail; + size_t tx_last_tail; + size_t tx_size; + bool* tx_isctx; + + // buffers holding packet data + std::vector> circ_buffer; +} e10k_queue_t; + +class IxgbeDriverRep; + +class IxgbeDriver : public EthernetDevice { + public: + explicit IxgbeDriver(pci::Device& dev) + : itf_(network_manager->NewInterface(*this)), dev_(dev), + bar0_(dev.GetBar(0)) { + dev_.SetBusMaster(true); + + // set up interrupts, polling won't work after this + auto msix = dev_.MsixEnable(); + kbugon(!msix, "Ixgbe without msix is unsupported\n"); + + // each core gets a queue struct + ixgmq.resize(Cpu::Count()); + } + + static void Create(pci::Device& dev); + static bool Probe(pci::Device& dev) { + if (dev.GetVendorId() == kIxgbeVendorId && + dev.GetDeviceId() == kIxgbeDeviceId && dev.GetFunc() == 0) { + 
IxgbeDriver::Create(dev); + return true; + } + return false; + } + + void Run(); + void Send(std::unique_ptr buf, PacketInfo pinfo) override; + const EthernetAddress& GetMacAddress() override; + + protected: + static const constexpr uint16_t kIxgbeVendorId = 0x8086; + static const constexpr uint16_t kIxgbeDeviceId = 0x10F8; // 0x10FB; + + /* FreeBSD: + * RxDescriptors Valid Range: 64-4096 Default Value: 256 This value is the + * number of receive descriptors allocated for each RX queue. Increasing this + * value allows the driver to buffer more incoming packets. Each descriptor + * is 16 bytes. A receive buffer is also allocated for each descriptor. + * + * Note: with 8 rings and a dual port card, it is possible to bump up + * against the system mbuf pool limit, you can tune nmbclusters + * to adjust for this. + */ + static const constexpr uint32_t NTXDESCS = 256; + static const constexpr uint32_t NRXDESCS = 256; + // static const constexpr uint32_t NTXDESCS = 4096; + // static const constexpr uint32_t NRXDESCS = 4096; + static const constexpr uint32_t RXBUFSZ = 4096; + // static const constexpr uint32_t RXBUFSZ = 16384; + + // Class with per core queue data structures + class e10Kq { + public: + e10Kq(size_t idx, Nid nid) + : rx_head_(0), rx_tail_(0), rx_size_(NRXDESCS), tx_tail_(0), + tx_last_tail_(0), tx_size_(NTXDESCS), idx_(idx), rxflag_(0), + rsc_used(false), hanc{0} { + + circ_buffer_.reserve(NRXDESCS); + for (uint32_t k = 0; k < NRXDESCS; k++) { + circ_buffer_.emplace_back(MakeUniqueIOBuf(RXBUFSZ, true)); + } + + // rsc_chain_ is a map between receive descriptor number and + // packet len, need packet len to extract out + // packet data else code will read redundant + // zeros if packet len does not use full buffer + // TODO: should be optimized + rsc_chain_.reserve(NRXDESCS); + + // RX ring buffer allocation + auto sz = align::Up(sizeof(rdesc_legacy_t) * NRXDESCS, 4096); + auto order = Fls(sz - 1) - pmem::kPageShift + 1; + auto page = 
page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", + __FUNCTION__); + auto addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + rx_ring_ = static_cast(addr); + + // TX ring buffer allocation + sz = align::Up(sizeof(tdesc_legacy_t) * NTXDESCS, 4096); + order = Fls(sz - 1) - pmem::kPageShift + 1; + page = page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", + __FUNCTION__); + addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + tx_ring_ = static_cast(addr); + + // TX adv context buffer allocation + sz = align::Up(sizeof(bool) * NTXDESCS, 4096); + order = Fls(sz - 1) - pmem::kPageShift + 1; + page = page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", + __FUNCTION__); + addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + tx_isctx_ = static_cast(addr); + +#ifdef TX_HEAD_WB + // TODO: not sure how much exactly to allocate for head wb addr + tx_head_ = (uint32_t*)malloc(4 * sizeof(uint32_t)); + memset(tx_head_, 0, 4 * sizeof(uint32_t)); + txhwbaddr_ = reinterpret_cast(tx_head_); + // txhwbaddr must be 4 byte (dword) aligned: low two bits must be zero + ebbrt::kbugon((txhwbaddr_ & 0x3) != 0, "txhwbaddr not 4 byte aligned\n"); + kassert((txhwbaddr_ & 0x3) == 0); +#else + tx_head_ = 0; +#endif + + // get starting address, need to write to device registers + rxaddr_ = reinterpret_cast(rx_ring_); + txaddr_ = reinterpret_cast(tx_ring_); + rx_size_bytes_ = sizeof(rdesc_legacy_t) * NRXDESCS; + tx_size_bytes_ = sizeof(tdesc_legacy_t) * NTXDESCS; + + // must be 128 byte aligned + ebbrt::kbugon((rxaddr_ & 0x7F) != 0, "rx_addr_ not 128 byte aligned\n"); + ebbrt::kbugon((txaddr_ & 0x7F) != 0, "tx_addr_ not 128 byte aligned\n"); + ebbrt::kbugon((rx_size_bytes_ & 0x7F) != 0, + "rx_size_bytes_ not 128 byte aligned\n"); + ebbrt::kbugon((tx_size_bytes_ & 0x7F) != 0, + "tx_size_bytes_ not 128 byte aligned\n"); + } + + size_t rx_head_; + 
size_t rx_tail_; + size_t rx_size_; + size_t tx_tail_; + size_t tx_last_tail_; + size_t tx_size_; + size_t idx_; + size_t rx_size_bytes_; + size_t tx_size_bytes_; + uint64_t rxaddr_; + uint64_t txaddr_; + uint64_t txhwbaddr_; + uint64_t rxflag_; + + std::vector> circ_buffer_; + std::vector> rsc_chain_; + + rdesc_legacy_t* rx_ring_; + tdesc_legacy_t* tx_ring_; + bool* tx_isctx_; + bool rsc_used; + int hanc; +#ifdef TX_HEAD_WB + uint32_t* tx_head_; +#else + size_t tx_head_; +#endif + }; + + private: + EbbRef ebb_; + NetworkManager::Interface& itf_; + EthernetAddress mac_addr_; + + void Init(); + void PhyInit(); + void StopDevice(); + void GlobalReset(); + void SetupMultiQueue(uint32_t i); + void FinishSetup(); + + // device register writing code below + bool SwsmSmbiRead(); + void SwsmSmbiClear(); + + void SwsmSwesmbiSet(); + bool SwsmSwesmbiRead(); + void SwsmSwesmbiClear(); + + uint32_t ReadSwfwSyncSmBits(uint32_t m); + void WriteSwfwSyncSmBits(uint32_t m); + void WriteSwfwSyncSmBits2(uint32_t m); + + bool SwfwLockPhy(); + void SwfwUnlockPhy(); + bool SwfwSemAcquire(); + void SwfwSemRelease(); + + void WriteRxctrl(uint32_t m); + void WriteDmatxctl(uint32_t m); + void WriteDmatxctl_te(uint32_t m); + + void WriteEimc(uint32_t m); + void WriteEitr(uint32_t n, uint32_t m); + + void WriteTxdctl(uint32_t n, uint32_t m); + + void WriteRxdctl_1(uint32_t n, uint32_t m); + void WriteRxdctl_1_enable(uint32_t n, uint32_t m); + + void WriteRxdctl_2(uint32_t n, uint32_t m); + void WriteCtrl(uint32_t m); + void WriteCtrlExt(uint32_t m); + void WriteFcttv(uint32_t n, uint32_t m); + void WriteFcrtl(uint32_t n, uint32_t m); + void WriteFcrth(uint32_t n, uint32_t m); + void WriteFcrtv(uint32_t m); + void WriteFccfg(uint32_t m); + void WriteEerd(uint32_t m); + + void WriteCorectl(uint16_t m); + + void WriteAutoc(uint32_t m); + + void WriteEicr(uint32_t m); + void WriteGpie(uint32_t m); + + void WriteEims(uint32_t m); + + void WriteRal(uint32_t n, uint32_t m); + void WriteRah(uint32_t 
n, uint32_t m); + + void WriteMta(uint32_t n, uint32_t m); + void WriteVfta(uint32_t n, uint32_t m); + void WritePfvlvf(uint32_t n, uint32_t m); + void WritePfvlvfb(uint32_t n, uint32_t m); + void WriteMpsar(uint32_t n, uint32_t m); + void WriteFtqf(uint32_t n, uint32_t m); + void WriteSaqf(uint32_t n, uint32_t m); + void WriteDaqf(uint32_t n, uint32_t m); + void WriteSdpqf(uint32_t n, uint32_t m); + + void WriteFctrl(uint32_t m); + void WriteFhft_1(uint32_t n, uint32_t m); + void WriteFhft_2(uint32_t n, uint32_t m); + + void WritePfuta(uint32_t n, uint32_t m); + void WriteMcstctrl(uint32_t m); + + void WriteRttdqsel(uint32_t m); + void WriteRttbcnrc(uint32_t m); + + void WriteDcaTxctrlTxdescWbro(uint32_t n, uint32_t m); + void WriteDcaTxctrl(uint32_t n, uint32_t m); + void WriteDcaRxctrl(uint32_t n, uint32_t m); + void WriteDcaRxctrlClear(uint32_t n, uint32_t m); + void WriteDcaRxctrl_1(uint32_t n, uint32_t m); + void WriteDcaRxctrl_2(uint32_t n, uint32_t m); + void WriteDcaCtrl(uint32_t m); + + void WriteRdbal_1(uint32_t n, uint32_t m); + void WriteRdbal_2(uint32_t n, uint32_t m); + + void WriteRdbah_1(uint32_t n, uint32_t m); + void WriteRdbah_2(uint32_t n, uint32_t m); + + void WriteRdlen_1(uint32_t n, uint32_t m); + void WriteRdlen_2(uint32_t n, uint32_t m); + + void WriteSrrctl_1(uint32_t n, uint32_t m); + void WriteSrrctlZero(uint32_t n); + void WriteSrrctl_1_desctype(uint32_t n, uint32_t m); + void WriteRscdbu(uint32_t m); + + void WriteRdt_1(uint32_t n, uint32_t m); + void WriteRdh_1(uint32_t n, uint32_t m); + void WriteRdt_2(uint32_t n, uint32_t m); + + void WriteIvarAlloc0(uint32_t n, uint32_t m); + void WriteIvarAllocval0(uint32_t n, uint32_t m); + void WriteIvarAlloc1(uint32_t n, uint32_t m); + void WriteIvarAllocval1(uint32_t n, uint32_t m); + void WriteIvarAlloc2(uint32_t n, uint32_t m); + void WriteIvarAllocval2(uint32_t n, uint32_t m); + void WriteIvarAlloc3(uint32_t n, uint32_t m); + void WriteIvarAllocval3(uint32_t n, uint32_t m); + + void 
WriteSecrxctrl_Rx_Dis(uint32_t m); + + void WriteTdbal(uint32_t n, uint32_t m); + void WriteTdbah(uint32_t n, uint32_t m); + void WriteTdlen(uint32_t n, uint32_t m); + + void WriteTdh(uint32_t n, uint32_t m); + void WriteTdt(uint32_t n, uint32_t m); + + void WriteTdwbal(uint32_t n, uint32_t m); + void WriteTdwbah(uint32_t n, uint32_t m); + + void WriteHlreg0(uint32_t m); + void WriteRdrxctl(uint32_t m); + void WriteRdrxctlRSCFRSTSIZE(uint32_t m); + + void WriteEiac(uint32_t m); + void WriteEimsn(uint32_t n, uint32_t m); + + void WriteRfctl(uint32_t m); + + void WriteRscctl(uint32_t n, uint32_t m); + void WritePsrtype(uint32_t n, uint32_t m); + + void WriteRxcsum(uint32_t m); + void WriteTxpbthresh(uint32_t n, uint32_t m); + void WriteMrqc(uint32_t m); + void WriteDtxmxszrq(uint32_t m); + void WriteMflcn(uint32_t m); + void WriteReta(uint32_t n, uint32_t m); + + void WritePsrtypeZero(uint32_t n); + + void WriteRttdcs(uint32_t m); + void WriteRttdcsArbdisEn(uint32_t m); + void WriteRxpbsize(uint32_t n, uint32_t m); + void WriteTxpbsize(uint32_t n, uint32_t m); + void WriteTxpbThresh(uint32_t n, uint32_t m); + void WriteMtqc(uint32_t m); + void WritePfvtctl(uint32_t m); + void WriteRtrup2tc(uint32_t m); + void WriteRttup2tc(uint32_t m); + void WritePfqde(uint32_t m); + void WriteRttdt1c(uint32_t m); + void WriteRttdt2c(uint32_t n, uint32_t m); + void WriteRttpt2c(uint32_t n, uint32_t m); + void WriteRtrpt4c(uint32_t n, uint32_t m); + void WriteRttpcs(uint32_t m); + void WriteRtrpcs(uint32_t m); + void WritePfvml2flt(uint32_t n, uint32_t m); + + void WriteMngtxmap(uint32_t m); + + void WriteRxfeccerr0(uint32_t m); + + uint8_t ReadRdrxctlDmaidone(); + + void ReadEicr(); + bool ReadStatusPcieMes(); + uint8_t ReadStatusLanId(); + void ReadCtrl(); + bool ReadEerdDone(); + uint16_t ReadEerdData(); + uint16_t ReadEeprom(uint16_t offset); + uint8_t ReadAnlp1(); + uint8_t ReadAutocRestartAn(); + uint8_t ReadEecAutoRd(); + uint32_t ReadEims(); + + uint32_t ReadRal(uint32_t n); 
+ uint16_t ReadRah(uint32_t n); + uint8_t ReadRahAv(uint32_t n); + + uint8_t ReadRxdctl_1_enable(uint32_t n); + uint8_t ReadSecrxstat_Sr_Rdy(); + + uint8_t ReadTxdctl_enable(uint32_t n); + + uint16_t ReadRdh_1(uint32_t n); + uint16_t ReadTdh(uint32_t n); + uint16_t ReadRdt_1(uint32_t n); + + // some statistics + uint32_t ReadTpr(); + uint32_t ReadGprc(); + bool ReadLinksLinkUp(); + + // Process packet functions + void ProcessPacket(uint32_t n); + uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr); + void SendPacket(uint32_t n); + + e10k_queue_t& GetQueue() const { return *ixgq; } + + e10Kq& GetMultiQueue(size_t index) const { return *ixgmq[index]; } + + pci::Device& dev_; + pci::Bar& bar0_; + + struct IxgbeRegs { + volatile uint32_t kIxgbeCtrl; + volatile uint32_t kIxgbeCtrlBak; + volatile uint32_t kIxgbeStatus; + }; + + e10k_queue_t* ixgq; + uint8_t rcv_vector{0}; + + std::vector> ixgmq; + + friend class IxgbeDriverRep; +}; // class IxgbeDriver + +class IxgbeDriverRep : public MulticoreEbb { + public: + explicit IxgbeDriverRep(const IxgbeDriver& root); + void Run(); + void ReceivePoll(); + void ReclaimTx(); + void ReclaimRx(); + void Send(std::unique_ptr buf, PacketInfo pinfo); + void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, + enum l4_type l4type); + void AddTx(const uint8_t* pa, uint64_t len, uint64_t totallen, bool first, + bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum); + + private: + uint16_t ReadRdh_1(uint32_t n); + uint16_t ReadRdt_1(uint32_t n); + void WriteRdt_1(uint32_t n, uint32_t m); + void WriteRdh_1(uint32_t n, uint32_t m); + // uint16_t ReadRdt_1(uint32_t n); + // uint16_t ReadRdh_1(uint32_t n); + void WriteTdt_1(uint32_t n, uint32_t m); + void WriteEimcn(uint32_t n, uint32_t m); + uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, + bool* process_rsc, uint32_t* rnt); + + const IxgbeDriver& root_; + e10k_queue_t& ixgq_; + IxgbeDriver::e10Kq& ixgmq_; + + EventManager::IdleCallback 
receive_callback_; + +}; // class IxgbeDriverRep + +} // namespace ebbrt + +#endif // BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ diff --git a/src/native/Main.cc b/src/native/Main.cc index de40afc9..0588ad86 100644 --- a/src/native/Main.cc +++ b/src/native/Main.cc @@ -47,7 +47,11 @@ #include "Trans.h" #include "VMem.h" #include "VMemAllocator.h" +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ +#include "IxgbeDriver.h" +#else #include "VirtioNet.h" +#endif namespace { bool started_once = false; @@ -146,18 +150,29 @@ ebbrt::Main(multiboot::Information* mbi) { Timer::Init(); smp::Init(); event_manager->ReceiveToken(); + #ifdef __EBBRT_ENABLE_NETWORKING__ NetworkManager::Init(); pci::Init(); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + pci::RegisterProbe(IxgbeDriver::Probe); +#else pci::RegisterProbe(VirtioNetDriver::Probe); +#endif + pci::LoadDrivers(); network_manager->StartDhcp().Then([](Future fut) { fut.Get(); // Dhcp completed #ifdef __EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ +// Currently not supported in BMNIC since we don't pass arguments +// via grub +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ Messenger::Init(); runtime::Init(); #endif +#endif #endif // run global ctors for (unsigned i = 0; i < (end_ctors - start_ctors); ++i) { diff --git a/src/native/Msr.h b/src/native/Msr.h index 4e3b7ba6..9adc0699 100644 --- a/src/native/Msr.h +++ b/src/native/Msr.h @@ -30,6 +30,17 @@ inline uint64_t Read(uint32_t index) { inline void Write(uint32_t index, uint64_t data) { uint32_t low = data; uint32_t high = data >> 32; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // TODO - correct fix is here? 
+ // GP fault happens when writing a 1 to bit #3 for kX2apicDcr, + // which is a reserved bit + // only happens in baremetal, VM prob virtualized this issue + if ((((data >> 2) & 0x1) == 1) && index == kX2apicDcr) { + low = (data & 0x3) | ((data & 0x4) << 1); + high = 0x0; + } +#endif asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); } } // namespace msr diff --git a/src/native/Net.cc b/src/native/Net.cc index 581cec54..d884b7e0 100644 --- a/src/native/Net.cc +++ b/src/native/Net.cc @@ -12,7 +12,8 @@ ebbrt::NetworkManager::NewInterface(EthernetDevice& ether_dev) { return *interface_; } -void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Drop packets that are too small @@ -26,7 +27,7 @@ void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf) { switch (ntohs(eth_header.type)) { case kEthTypeIp: { - ReceiveIp(eth_header, std::move(buf)); + ReceiveIp(eth_header, std::move(buf), rxflag); break; } case kEthTypeArp: { diff --git a/src/native/Net.h b/src/native/Net.h index d7b575cb..68713d48 100644 --- a/src/native/Net.h +++ b/src/native/Net.h @@ -25,9 +25,16 @@ #include "RcuTable.h" #include "SharedPoolAllocator.h" +// IP and L4 checksum offload bits +#define RXFLAG_IPCS (1 << 0) +#define RXFLAG_IPCS_VALID (1 << 1) +#define RXFLAG_L4CS (1 << 2) +#define RXFLAG_L4CS_VALID (1 << 3) + namespace ebbrt { struct PacketInfo { static const constexpr uint8_t kNeedsCsum = 1; + static const constexpr uint8_t kNeedsIpCsum = 2; static const constexpr uint8_t kGsoNone = 0; static const constexpr uint8_t kGsoTcpv4 = 1; static const constexpr uint8_t kGsoUdp = 3; @@ -230,7 +237,7 @@ class NetworkManager : public StaticSharedEbb { explicit Interface(EthernetDevice& ether_dev) : address_(nullptr), ether_dev_(ether_dev) {} - void Receive(std::unique_ptr buf); + void Receive(std::unique_ptr buf, uint64_t rxflag = 
0); void Send(std::unique_ptr buf, PacketInfo pinfo = PacketInfo()); void SendUdp(UdpPcb& pcb, Ipv4Address addr, uint16_t port, std::unique_ptr buf); @@ -260,11 +267,14 @@ class NetworkManager : public StaticSharedEbb { }; void ReceiveArp(EthernetHeader& eh, std::unique_ptr buf); - void ReceiveIp(EthernetHeader& eh, std::unique_ptr buf); + void ReceiveIp(EthernetHeader& eh, std::unique_ptr buf, + uint64_t rxflag = 0); void ReceiveIcmp(EthernetHeader& eh, Ipv4Header& ih, std::unique_ptr buf); - void ReceiveUdp(Ipv4Header& ih, std::unique_ptr buf); - void ReceiveTcp(const Ipv4Header& ih, std::unique_ptr buf); + void ReceiveUdp(Ipv4Header& ih, std::unique_ptr buf, + uint64_t rxflag = 0); + void ReceiveTcp(const Ipv4Header& ih, std::unique_ptr buf, + uint64_t rxflag = 0); void ReceiveDhcp(Ipv4Address from_addr, uint16_t from_port, std::unique_ptr buf); void EthArpSend(uint16_t proto, const Ipv4Header& ih, diff --git a/src/native/NetIcmp.cc b/src/native/NetIcmp.cc index e5c06153..6ecfde0d 100644 --- a/src/native/NetIcmp.cc +++ b/src/native/NetIcmp.cc @@ -19,9 +19,11 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( auto dp = buf->GetMutDataPointer(); auto& icmp_header = dp.Get(); - // checksum +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ + // software checksum if (IpCsum(*buf)) return; +#endif // if echo_request, send reply if (icmp_header.type == kIcmpEchoRequest) { @@ -43,9 +45,19 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( ip_header.ttl = kIpDefaultTtl; ip_header.chksum = 0; + + PacketInfo pinfo; + pinfo.flags = 0; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // hardware ip checksum offload + pinfo.flags |= PacketInfo::kNeedsIpCsum; +#else ip_header.chksum = ip_header.ComputeChecksum(); +#endif buf->Retreat(ip_header.HeaderLength()); - EthArpSend(kEthTypeIp, ip_header, std::move(buf)); + + EthArpSend(kEthTypeIp, ip_header, std::move(buf), pinfo); } } diff --git a/src/native/NetIp.cc b/src/native/NetIp.cc index 683311f3..339b613b 100644 --- 
a/src/native/NetIp.cc +++ b/src/native/NetIp.cc @@ -28,8 +28,9 @@ bool ebbrt::NetworkManager::Interface::ItfAddress::isLocalNetwork( } // Receive an Ipv4 packet -void ebbrt::NetworkManager::Interface::ReceiveIp( - EthernetHeader& eth_header, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveIp(EthernetHeader& eth_header, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); if (unlikely(packet_len < sizeof(Ipv4Header))) @@ -51,8 +52,21 @@ void ebbrt::NetworkManager::Interface::ReceiveIp( buf->TrimEnd(packet_len - tot_len); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // baremetal checksum offload + if (unlikely((rxflag & RXFLAG_IPCS) == 0)) { + ebbrt::kprintf("%s RXFLAG_IPCS failed\n", __FUNCTION__); + return; + } + + if (unlikely((rxflag & RXFLAG_IPCS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_IPCS_VALID failed\n", __FUNCTION__); + return; + } +#else if (unlikely(ip_header.ComputeChecksum() != 0)) return; +#endif auto addr = Address(); // Unless the protocol is UDP or we have an address on this interface and the @@ -79,11 +93,11 @@ void ebbrt::NetworkManager::Interface::ReceiveIp( break; } case kIpProtoUDP: { - ReceiveUdp(ip_header, std::move(buf)); + ReceiveUdp(ip_header, std::move(buf), rxflag); break; } case kIpProtoTCP: { - ReceiveTcp(ip_header, std::move(buf)); + ReceiveTcp(ip_header, std::move(buf), rxflag); break; } } @@ -115,9 +129,14 @@ void ebbrt::NetworkManager::Interface::SendIp(std::unique_ptr buf, ih.chksum = 0; ih.src = src; ih.dst = dst; - ih.chksum = ih.ComputeChecksum(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // baremetal ip checksum offload + pinfo.flags |= PacketInfo::kNeedsIpCsum; +#else + ih.chksum = ih.ComputeChecksum(); kassert(ih.ComputeChecksum() == 0); +#endif pinfo.csum_start += sizeof(Ipv4Header); pinfo.hdr_len += sizeof(Ipv4Header); diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index e792f34a..c0f833fb 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ 
-194,8 +194,9 @@ ebbrt::Ipv4Address ebbrt::NetworkManager::TcpPcb::GetRemoteAddress() { } // Receive a TCP packet on an interface -void ebbrt::NetworkManager::Interface::ReceiveTcp( - const Ipv4Header& ih, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveTcp(const Ipv4Header& ih, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Ensure we have a header @@ -210,10 +211,21 @@ void ebbrt::NetworkManager::Interface::ReceiveTcp( if (unlikely(addr->isBroadcast(ih.dst) || ih.dst.isMulticast())) return; - // XXX: Check if rxcsum is supported - // if (unlikely(IpPseudoCsum(*buf, ih.proto, ih.src, ih.dst))) - // return; +// XXX: Check if rxcsum is supported +// if (unlikely(IpPseudoCsum(*buf, ih.proto, ih.src, ih.dst))) +// return; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (unlikely((rxflag & RXFLAG_L4CS) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS failed\n", __FUNCTION__); + return; + } + + if (unlikely((rxflag & RXFLAG_L4CS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS_VALID failed\n", __FUNCTION__); + return; + } +#endif auto hdr_len = tcp_header.HdrLen(); if (unlikely(hdr_len < sizeof(TcpHeader) || hdr_len > packet_len)) return; diff --git a/src/native/NetUdp.cc b/src/native/NetUdp.cc index 7da5fdc4..992ee21f 100644 --- a/src/native/NetUdp.cc +++ b/src/native/NetUdp.cc @@ -57,8 +57,9 @@ void ebbrt::NetworkManager::UdpPcb::Receive( } // Receive UDP packet on an interface -void ebbrt::NetworkManager::Interface::ReceiveUdp( - Ipv4Header& ip_header, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveUdp(Ipv4Header& ip_header, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Ensure we have a header @@ -75,10 +76,20 @@ void ebbrt::NetworkManager::Interface::ReceiveUdp( // trim any excess off the packet buf->TrimEnd(packet_len - ntohs(udp_header.length)); - // XXX: Check if rxcsum supported - // if (udp_header.checksum && - // IpPseudoCsum(*buf, ip_header.proto, 
ip_header.src, ip_header.dst)) - // return; +// XXX: Check if rxcsum supported +// if (udp_header.checksum && +// IpPseudoCsum(*buf, ip_header.proto, ip_header.src, ip_header.dst)) +// return; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (unlikely((rxflag & RXFLAG_L4CS) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS failed\n", __FUNCTION__); + return; + } + if (unlikely((rxflag & RXFLAG_L4CS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS_VALID failed\n", __FUNCTION__); + return; + } +#endif auto entry = network_manager->udp_pcbs_.find(ntohs(udp_header.dst_port)); diff --git a/src/native/Pci.cc b/src/native/Pci.cc index cdd53dae..2a740f7e 100644 --- a/src/native/Pci.cc +++ b/src/native/Pci.cc @@ -9,6 +9,7 @@ #include "../Align.h" #include "../ExplicitlyConstructed.h" #include "Debug.h" +#include "GeneralPurposeAllocator.h" #include "Io.h" #include "VMem.h" #include "VMemAllocator.h" @@ -34,7 +35,11 @@ uint8_t PciRead8(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { } uint16_t PciRead16(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { PciSetAddr(bus, device, func, offset); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + return ebbrt::io::In16(kPciDataPort + (offset & 2)); +#else return ebbrt::io::In16(kPciDataPort); +#endif } uint32_t PciRead32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { @@ -45,7 +50,12 @@ uint32_t PciRead32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { void PciWrite16(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset, uint16_t val) { PciSetAddr(bus, device, func, offset); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + ebbrt::io::Out16(kPciDataPort + (offset & 2), val); +#else ebbrt::io::Out16(kPciDataPort, val); +#endif } void PciWrite32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset, @@ -71,8 +81,12 @@ void EnumerateBus(uint8_t bus) { if (dev) continue; + dev.DumpAddress(); + dev.DumpInfo(); + if (dev.IsBridge()) { - ebbrt::kabort("Secondary bus unsupported!\n"); + // ebbrt::kabort("Secondary bus unsupported!\n"); + 
continue; } else { devices->emplace_back(bus, device, func); } @@ -101,6 +115,11 @@ void ebbrt::pci::Init() { devices.construct(); driver_probes.construct(); EnumerateAllBuses(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // TODO - Kludge to identify where NIC sits in device tree, should incorporate + // Dan's pull request for enumerating bridges + EnumerateBus(0x1); +#endif } void ebbrt::pci::RegisterProbe(std::function probe) { @@ -149,6 +168,18 @@ uint16_t ebbrt::pci::Function::GetCommand() const { return Read16(kCommandAddr); } +uint8_t ebbrt::pci::Function::GetClassCode() const { + return Read8(kClassCodeAddr); +} + +uint8_t ebbrt::pci::Function::GetFunc() const { return func_; } + +uint8_t ebbrt::pci::Function::GetSubclass() const { + return Read8(kSubclassAddr); +} + +uint8_t ebbrt::pci::Function::GetProgIf() const { return Read8(kProgIfAddr); } + uint8_t ebbrt::pci::Function::GetHeaderType() const { return Read8(kHeaderTypeAddr) & ~kHeaderMultifuncMask; } @@ -187,6 +218,11 @@ void ebbrt::pci::Function::DumpAddress() const { kprintf("%u:%u:%u\n", bus_, device_, func_); } +void ebbrt::pci::Function::DumpInfo() const { + kprintf("Vendor ID: 0x%x ", GetVendorId()); + kprintf("Device ID: 0x%x\n", GetDeviceId()); +} + ebbrt::pci::Bar::Bar(pci::Device& dev, uint32_t bar_val, uint8_t idx) : vaddr_(nullptr), is_64_(false), prefetchable_(false) { mmio_ = !(bar_val & kIoSpaceFlag); @@ -226,6 +262,8 @@ ebbrt::pci::Bar::~Bar() { kbugon(vaddr_ != nullptr, "pci::Bar: Need to free mapped region\n"); } +void* ebbrt::pci::Bar::GetVaddr() { return vaddr_; } + bool ebbrt::pci::Bar::Is64() const { return is_64_; } void ebbrt::pci::Bar::Map() { @@ -233,10 +271,21 @@ void ebbrt::pci::Bar::Map() { return; auto npages = align::Up(size_, pmem::kPageSize) >> pmem::kPageShift; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + auto pf = std::make_unique(); + auto& ref = *pf; + auto page = vmem_allocator->Alloc(npages, std::move(pf)); + vaddr_ = reinterpret_cast(page.ToAddr()); + kbugon(page == 
Pfn::None(), "Failed to allocate virtual pages for mmio\n"); + vmem::MapMemory(page, Pfn::Down(addr_), size_); + ref.SetMap(page, Pfn::Down(addr_), size_); +#else auto page = vmem_allocator->Alloc(npages); vaddr_ = reinterpret_cast(page.ToAddr()); kbugon(page == Pfn::None(), "Failed to allocate virtual pages for mmio\n"); vmem::MapMemory(page, Pfn::Down(addr_), size_); +#endif } uint8_t ebbrt::pci::Bar::Read8(size_t offset) { @@ -415,7 +464,15 @@ void ebbrt::pci::Device::SetMsixEntry(size_t entry, uint8_t vector, uint8_t dest) { auto& msix_bar = GetBar(msix_bar_idx_); auto offset = msix_table_offset_ + entry * kMsixTableEntrySize; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // more precise + msix_bar.Write32(offset + kMsixTableEntryAddrLow, 0xFEE00000 | dest << 12); + msix_bar.Write32(offset + kMsixTableEntryAddrHigh, 0x0); +#else msix_bar.Write32(offset + kMsixTableEntryAddr, 0xFEE00000 | dest << 12); +#endif + msix_bar.Write32(offset + kMsixTableEntryData, vector); MsixUnmaskEntry(entry); } diff --git a/src/native/Pci.h b/src/native/Pci.h index 15bc1dce..3d40af31 100644 --- a/src/native/Pci.h +++ b/src/native/Pci.h @@ -30,6 +30,7 @@ class Function { uint8_t GetLatencyTimer() const; uint8_t GetHeaderType() const; uint8_t GetBist() const; + uint8_t GetFunc() const; operator bool() const; bool IsMultifunc() const; @@ -40,6 +41,7 @@ class Function { void DisableInt(); void DumpAddress() const; + void DumpInfo() const; protected: static const constexpr uint8_t kVendorIdAddr = 0x00; @@ -87,6 +89,7 @@ class Bar { void Write8(size_t offset, uint8_t val); void Write16(size_t offset, uint16_t val); void Write32(size_t offset, uint32_t val); + void* GetVaddr(); private: static const constexpr uint32_t kIoSpaceFlag = 0x1; @@ -166,6 +169,8 @@ class Device : public Function { static const constexpr size_t kMsixTableEntryAddr = 0; static const constexpr size_t kMsixTableEntryData = 8; static const constexpr size_t kMsixTableEntryControl = 12; + static const constexpr size_t 
kMsixTableEntryAddrLow = 0; + static const constexpr size_t kMsixTableEntryAddrHigh = 4; static const constexpr uint32_t kMsixTableEntryControlMaskBit = 1; diff --git a/src/native/config.cmake b/src/native/config.cmake index f3831979..6e51122c 100644 --- a/src/native/config.cmake +++ b/src/native/config.cmake @@ -1,6 +1,7 @@ # EbbRT native platform-specific configuration option(__EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ "Enable Distributed Runtime Support" ON) option(__EBBRT_ENABLE_NETWORKING__ "Enable Networking" ON) +option(__EBBRT_ENABLE_BAREMETAL_NIC__ "Enable Baremetal NIC" OFF) option(__EBBRT_ENABLE_TRACE__ "Enable Tracing Subsystem" OFF) option(LARGE_WINDOW_HACK "Enable Large TCP Window Hack" OFF) option(PAGE_CHECKER "Enable Page Checker" OFF) diff --git a/src/native/config.h.in b/src/native/config.h.in index dc9a773e..8ae06f0a 100644 --- a/src/native/config.h.in +++ b/src/native/config.h.in @@ -2,6 +2,7 @@ #cmakedefine __EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ #cmakedefine __EBBRT_ENABLE_NETWORKING__ #cmakedefine __EBBRT_ENABLE_TRACE__ +#cmakedefine __EBBRT_ENABLE_BAREMETAL_NIC__ #cmakedefine LARGE_WINDOW_HACK #cmakedefine PAGE_CHECKER #cmakedefine VIRTIO_ZERO_COPY From 92ff77e380822d1a25f43cb259536e1b8c4f4206 Mon Sep 17 00:00:00 2001 From: Han Date: Wed, 29 Nov 2017 15:17:14 -0500 Subject: [PATCH 02/20] feat(baremetal NIC): initial push for baremetal NIC Only works on Intel 82599 family based NICs --- src/IOBuf.h | 3 +- src/native/GeneralPurposeAllocator.h | 20 + src/native/Ixgbe.h | 387 +++++ src/native/IxgbeDriver.cc | 2001 ++++++++++++++++++++++++++ src/native/IxgbeDriver.h | 473 ++++++ src/native/Main.cc | 15 + src/native/Msr.h | 11 + src/native/Net.cc | 5 +- src/native/Net.h | 18 +- src/native/NetIcmp.cc | 16 +- src/native/NetIp.cc | 29 +- src/native/NetTcp.cc | 22 +- src/native/NetUdp.cc | 23 +- src/native/Pci.cc | 59 +- src/native/Pci.h | 5 + src/native/config.cmake | 1 + src/native/config.h.in | 1 + 17 files changed, 3063 insertions(+), 26 deletions(-) create 
mode 100644 src/native/Ixgbe.h create mode 100644 src/native/IxgbeDriver.cc create mode 100644 src/native/IxgbeDriver.h diff --git a/src/IOBuf.h b/src/IOBuf.h index a405027b..c430f1a3 100644 --- a/src/IOBuf.h +++ b/src/IOBuf.h @@ -7,10 +7,10 @@ #include #include +#include #include #include #include -#include #include @@ -64,6 +64,7 @@ class IOBuf { } void TrimEnd(size_t amount) { length_ -= amount; } + void SetLength(size_t amount) { length_ = amount; } bool IsChained() const { return Next() != this; } diff --git a/src/native/GeneralPurposeAllocator.h b/src/native/GeneralPurposeAllocator.h index 12c91d97..b51869c6 100644 --- a/src/native/GeneralPurposeAllocator.h +++ b/src/native/GeneralPurposeAllocator.h @@ -16,6 +16,26 @@ namespace ebbrt { +// handler used in Pci.cc code to handle faults on multicores when mapping +// device +class MulticorePciFaultHandler : public ebbrt::VMemAllocator::PageFaultHandler { + ebbrt::Pfn vpage_; + ebbrt::Pfn ppage_; + size_t size_; + + public: + void SetMap(ebbrt::Pfn va, ebbrt::Pfn pa, size_t s) { + vpage_ = va; + ppage_ = pa; + size_ = s; + } + + void HandleFault(ebbrt::idt::ExceptionFrame* ef, + uintptr_t faulted_address) override { + ebbrt::vmem::MapMemory(vpage_, ppage_, size_); + } +}; + // page fault handler for mapping in physical pages // to virtual pages on all cores class LargeRegionFaultHandler : public ebbrt::VMemAllocator::PageFaultHandler { diff --git a/src/native/Ixgbe.h b/src/native/Ixgbe.h new file mode 100644 index 00000000..1a966ec1 --- /dev/null +++ b/src/native/Ixgbe.h @@ -0,0 +1,387 @@ +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ + +// from https://github.com/cisco-open-source/ethtool/ixgbe.c + +/* Register Bit Masks */ +#define IXGBE_FCTRL_SBP 0x00000002 +#define IXGBE_FCTRL_MPE 0x00000100 +#define IXGBE_FCTRL_UPE 0x00000200 +#define IXGBE_FCTRL_BAM 0x00000400 +#define IXGBE_FCTRL_PMCF 0x00001000 +#define IXGBE_FCTRL_DPF 0x00002000 +#define IXGBE_FCTRL_RPFCE 
0x00004000 +#define IXGBE_FCTRL_RFCE 0x00008000 +#define IXGBE_VLNCTRL_VET 0x0000FFFF +#define IXGBE_VLNCTRL_CFI 0x10000000 +#define IXGBE_VLNCTRL_CFIEN 0x20000000 +#define IXGBE_VLNCTRL_VFE 0x40000000 +#define IXGBE_VLNCTRL_VME 0x80000000 +#define IXGBE_LINKS_UP 0x40000000 +#define IXGBE_LINKS_SPEED 0x20000000 +#define IXGBE_SRRCTL_BSIZEPKT_MASK 0x0000007F +#define IXGBE_HLREG0_TXCRCEN 0x00000001 +#define IXGBE_HLREG0_RXCRCSTRP 0x00000002 +#define IXGBE_HLREG0_JUMBOEN 0x00000004 +#define IXGBE_HLREG0_TXPADEN 0x00000400 +#define IXGBE_HLREG0_LPBK 0x00008000 +#define IXGBE_RMCS_TFCE_802_3X 0x00000008 +#define IXGBE_RMCS_TFCE_PRIORITY 0x00000010 +#define IXGBE_FCCFG_TFCE_802_3X 0x00000008 +#define IXGBE_FCCFG_TFCE_PRIORITY 0x00000010 +#define IXGBE_MFLCN_PMCF 0x00000001 /* Pass MAC Control Frames */ +#define IXGBE_MFLCN_DPF 0x00000002 /* Discard Pause Frame */ +#define IXGBE_MFLCN_RPFCE 0x00000004 /* Receive Priority FC Enable */ +#define IXGBE_MFLCN_RFCE 0x00000008 /* Receive FC Enable */ + +enum l4_type { l4_type_udp = 0, l4_type_tcp, l4_type_sctp, l4_type_rsv }; + +#define ETHHDR_LEN 14 +#define IPHDR_LEN 20 +#define UDPHDR_LEN 8 + +#define RXFLAG_IPCS (1 << 0) +#define RXFLAG_IPCS_VALID (1 << 1) +#define RXFLAG_L4CS (1 << 2) +#define RXFLAG_L4CS_VALID (1 << 3) + +/*********************** + * RX + * Descriptors + **********************/ +// 7.1.5 Legacy Receive Descriptor, Table 7 - 11 +typedef union { + + uint64_t raw[2]; + + struct { + uint64_t buffer_address; + + union { + uint64_t word2_raw; + + struct { + uint64_t length : 16; + uint64_t fragment_checksum : 16; + + // uint64_t status : 8; + uint64_t dd : 1; + uint64_t eop : 1; + uint64_t rsvd1 : 1; + uint64_t vp : 1; + uint64_t udpcs : 1; + uint64_t l4cs : 1; + uint64_t ipcs : 1; + uint64_t pif : 1; + + // uint64_t errors : 8; + uint64_t rxe : 1; + uint64_t rsvd2 : 1; + uint64_t rsvd3 : 1; + uint64_t rsvd4 : 1; + uint64_t rsvd5 : 1; + uint64_t rsvd6 : 1; + uint64_t tcpe : 1; + uint64_t ipe : 1; + + uint64_t 
vlan_tag : 16; + }; // struct + + }; // union + + } __attribute__((packed)); // struct + +} rdesc_legacy_t; // typedef union + +// 7.1.6.1 Advanced Receive Descriptors Read Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t packet_buffer; + uint64_t header_buffer; + } __attribute__((packed)); // struct +} rdesc_adv_rf_t; + +// 7.1.6.2 Advanced Receive Descriptors — Write-Back Format +typedef union { + uint64_t raw[2]; + struct { + union { + uint32_t raw32_1; + struct { + uint32_t rss_type : 4; + + // packet type + uint32_t pt_ipv4 : 1; + uint32_t pt_ipv4e : 1; + uint32_t pt_ipv6 : 1; + uint32_t pt_ipv6e : 1; + uint32_t pt_tcp : 1; + uint32_t pt_udp : 1; + uint32_t pt_sctp : 1; + uint32_t pt_nfs : 1; + uint32_t pt_isesp : 1; + uint32_t pt_isah : 1; + uint32_t pt_linksec : 1; + uint32_t pt_l2packet : 1; + uint32_t pt_rsvd : 1; + + uint32_t rsccnt : 4; + uint32_t hdr_len : 10; + uint32_t sph : 1; + }; + }; // union raw32_1 + + union { + uint32_t raw32_2; + uint32_t rss_hash; + uint32_t fragment_checksum; + uint32_t rtt; + uint32_t fcoe_param; + uint32_t flow_directors_filters_id; // may need more, page 317 + }; // union raw32_2 + + union { + uint32_t raw32_3; + + struct { + // extended status + uint32_t dd : 1; + uint32_t eop : 1; + uint32_t flm : 1; + uint32_t vp : 1; + + // fcstat - 2 bits + uint32_t udpcs : 1; + uint32_t l4i : 1; + + uint32_t ipcs : 1; + uint32_t pif : 1; + uint32_t rsvd_1 : 1; + uint32_t vext : 1; + uint32_t udpv : 1; + uint32_t llint : 1; + uint32_t rsvd_2 : 4; + uint32_t ts : 1; + uint32_t secp : 1; + uint32_t lb : 1; + uint32_t rsvd_3 : 1; + + // extended error + uint32_t fdierr : 3; + uint32_t hbo : 1; + uint32_t rsvd : 3; + uint32_t secerr : 2; + uint32_t rxe : 1; + uint32_t l4e : 1; + uint32_t ipe : 1; + }; // status_last_descriptor; + + struct { + // extended status + uint32_t dd2 : 1; + uint32_t eop2 : 1; + uint32_t rsvd_4 : 2; + uint32_t next_descriptor_ptr : 16; + + // extended error + uint32_t error : 12; + }; // 
status_non_last_descriptor; + }; // union raw32_3 + + union { + uint32_t raw32_4; + struct { + uint32_t pkt_len : 16; + uint32_t vlan_tag : 16; + }; + }; // union raw32_4 + + } __attribute__((packed)); // struct +} rdesc_adv_wb_t; + +/*********************** + * TX + * Descriptors + **********************/ +// 7.2.3.2.2 Legacy Transmit Descriptor Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t buffer_address; + + union { + uint64_t word2_raw; + + struct { + uint64_t length : 16; + uint64_t cso : 8; + + // cmd + uint64_t eop : 1; + uint64_t ifcs : 1; + uint64_t ic : 1; + uint64_t rs : 1; + uint64_t rsvd_1 : 1; + uint64_t dext : 1; + uint64_t vle : 1; + uint64_t rsvd_2 : 1; + + // sta + uint64_t dd : 1; + uint64_t rsvd_3 : 3; + + uint64_t rsvd_4 : 4; + uint64_t css : 8; + uint64_t vlan : 16; + }; + }; + + } __attribute__((packed)); +} tdesc_legacy_t; + +// 7.2.3.2.3 Advanced Transmit Context Descriptor +typedef union { + uint64_t raw[2]; + + struct { + union { + uint64_t raw_1; + + struct { + uint64_t iplen : 9; + uint64_t maclen : 7; + uint64_t vlan : 16; + uint64_t ipsec_sa_index : 10; + uint64_t fcoef : 6; + uint64_t rsvd_1 : 16; + }; + }; + + union { + uint64_t raw_2; + + struct { + uint64_t ipsec_esp_len : 9; + + // tucmd + uint64_t snap : 1; + uint64_t ipv4 : 1; + uint64_t l4t : 2; // l4 packet type + uint64_t ipsec_type : 1; + uint64_t encyption : 1; + uint64_t fcoe : 1; + uint64_t rsvd_2 : 4; + + uint64_t dytp : 4; + uint64_t rsvd_3 : 5; + uint64_t dext : 1; + + uint64_t bcntlen : 6; + uint64_t idx : 1; + uint64_t rsvd_4 : 3; + uint64_t l4len : 8; + uint64_t mss : 16; + }; + }; + + } __attribute__((packed)); + +} tdesc_advance_ctxt_wb_t; + +// 7.2.3.2.4 Advanced Transmit Data Descriptor - Read Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t address; + + union { + uint64_t raw2; + struct { + uint64_t dtalen : 16; + uint64_t rsvd_1 : 2; + + // mac + uint64_t mac_ilsec : 1; + uint64_t mac_1588 : 1; + + uint64_t dtyp : 4; + + 
// dcmd + uint64_t eop : 1; + uint64_t ifcs : 1; + uint64_t rsvd_2 : 1; + uint64_t rs : 1; + uint64_t rsvd_3 : 1; + uint64_t dext : 1; + uint64_t vle : 1; + uint64_t tse : 1; + + // status + uint64_t dd : 1; + uint64_t rsvd_4 : 3; + + // idx + uint64_t idx : 3; + // uint64_t rsvd_5 : 2; + + uint64_t cc : 1; + + // popts + uint64_t ixsm : 1; + uint64_t txsm : 1; + uint64_t ipsec : 1; + uint64_t rsvd_6 : 3; + + uint64_t paylen : 18; + }; + }; + }; + +} tdesc_advance_tx_rf_t; + +// Advanced Transmit Data Descriptor - Write-back Format +typedef union { + uint64_t raw[2]; + + struct { + uint64_t rsvd_1; + + union { + uint64_t raw2; + + struct { + uint64_t rsvd_2 : 32; + + // status + uint64_t dd : 1; + uint64_t rsvd_3 : 3; + + uint64_t rsvd_4 : 28; + }; + }; + }; + +} tdesc_advance_tx_wbf_t; + +struct VirtioNetHeader { + static const constexpr uint8_t kNeedsCsum = 1; + static const constexpr uint8_t kGsoNone = 0; + static const constexpr uint8_t kGsoTcpv4 = 1; + static const constexpr uint8_t kGsoUdp = 3; + static const constexpr uint8_t kGsoTcpv6 = 4; + static const constexpr uint8_t kGsoEvn = 0x80; + + uint8_t flags; + uint8_t gso_type; + uint16_t hdr_len; + uint16_t gso_size; + uint16_t csum_start; + uint16_t csum_offset; + uint16_t num_buffers; +}; + +#endif // BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_H_ diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc new file mode 100644 index 00000000..ab28293b --- /dev/null +++ b/src/native/IxgbeDriver.cc @@ -0,0 +1,2001 @@ +// Copyright Boston University SESA Group 2013 - 2018. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +#include "IxgbeDriver.h" + +#include "../Align.h" +#include "../StaticIOBuf.h" +#include "../UniqueIOBuf.h" +#include "Clock.h" +#include "Debug.h" +#include "EventManager.h" +#include "Fls.h" +#include "Ixgbe.h" +#include "Net.h" +#include "Pfn.h" + +#include +#include +#include + +void ebbrt::IxgbeDriver::Create(pci::Device& dev) { + auto ixgbe_dev = new IxgbeDriver(dev); + + // physical device bringup + ixgbe_dev->Init(); + + ixgbe_dev->ebb_ = + IxgbeDriverRep::Create(ixgbe_dev, ebb_allocator->AllocateLocal()); + + // initialize per core rx and tx queues + for (size_t i = 0; i < Cpu::Count(); i++) { + ixgbe_dev->SetupMultiQueue(i); + } + + ixgbe_dev->FinishSetup(); + + // TODO remove? + ebbrt::clock::SleepMilli(200); + ebbrt::kprintf("intel 82599 card initialzed\n"); +} + +const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { + return mac_addr_; +} + +void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { + ebb_->Send(std::move(buf), std::move(pinfo)); +} + +void ebbrt::IxgbeDriver::Run() { ebb_->Run(); } + +// After packet transmission, need to mark bit in +// tx queue so that it can be used again +// TX_HEAD_WB does it automatically +void ebbrt::IxgbeDriverRep::ReclaimTx() { +#ifndef TX_HEAD_WB + size_t head = ixgmq_.tx_head_; + size_t tail = ixgmq_.tx_tail_; + tdesc_advance_tx_wbf_t* actx; + + // go through all descriptors owned by HW + while (head != tail) { + actx = reinterpret_cast(&(ixgmq_.tx_ring_[head])); + + // if context + if (ixgmq_.tx_isctx_[head]) { + head = (head + 1) % ixgmq_.tx_size_; + } + // if non eop + else if (!(actx->dd)) { + head = (head + 1) % ixgmq_.tx_size_; + } + // eop + else if (actx->dd) { + head = (head + 1) % ixgmq_.tx_size_; + ixgmq_.tx_head_ = head; + } + } +#endif +} + +// every TX requires a context struct before +void ebbrt::IxgbeDriverRep::AddContext(uint8_t idx, uint8_t maclen, + uint16_t iplen, 
uint8_t l4len, + enum l4_type l4type) { + + tdesc_advance_ctxt_wb_t* actx; + + auto tail = ixgmq_.tx_tail_; + + // context buffer already allocated, need to zero + actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail])); + + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + + memset(actx, 0, sizeof(tdesc_advance_ctxt_wb_t)); + ixgmq_.tx_isctx_[tail] = true; + + // refer to 82599 datasheet for these settings + actx->dytp = 0b0010; + actx->dext = 1; + actx->idx = idx; + actx->maclen = maclen; + actx->iplen = iplen; + + actx->ipv4 = 1; + actx->l4len = 0; // ignored when TSE not set + actx->l4t = l4type; + + // need to increment tail + ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; +} + +// Add a new packet to be transmitted +void ebbrt::IxgbeDriverRep::AddTx(const uint8_t* pa, uint64_t len, + uint64_t totallen, bool first, bool last, + uint8_t ctx, bool ip_cksum, + bool tcpudp_cksum) { + tdesc_advance_tx_rf_t* actx; + + auto tail = ixgmq_.tx_tail_; + actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail])); + + ixgmq_.tx_isctx_[tail] = false; + + actx->raw[0] = 0x0; + actx->raw[1] = 0x0; + + // pa is physical address of where send buffer exists + actx->address = reinterpret_cast(pa); + actx->dtalen = len; + if (first) { + actx->paylen = totallen; + } + + // type is advanced + actx->dtyp = 0b0011; + actx->dext = 1; + + // rs bit should only be set when eop is set + if (last) { + actx->rs = 1; + } else { + actx->rs = 0; + } + + // checksum + actx->ifcs = 1; + + // set last packet bit + if (last) { + actx->eop = 1; + } else { + actx->eop = 0; + } + + // TODO enable ip checksum + if (ctx != -1) { + actx->idx = ctx; + actx->cc = 1; + actx->ixsm = ip_cksum; // no ip checksum + actx->txsm = tcpudp_cksum; // udp or tcp checksum offload + } + + ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; +} + +void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { + auto dp = buf->GetDataPointer(); + auto 
len = buf->ComputeChainDataLength(); + auto count = buf->CountChainElements(); + bool ip_cksum = false; + bool tcpudp_cksum = false; + + ebbrt::kbugon(len >= 0xA0 * 1000, + "%s packet len bigger than max ether length\n", __FUNCTION__); + +// TODO threshold for triggering reclaim tx buffers +#ifndef TX_HEAD_WB + size_t free_desc = + IxgbeDriver::NTXDESCS - + (std::abs(static_cast(ixgmq_.tx_tail_ - ixgmq_.tx_head_))); + // free descripts must have enough for count in chained iobufs + if (free_desc < (count + 1)) { + // reclaim buffers + ReclaimTx(); + + free_desc = IxgbeDriver::NTXDESCS - + (std::abs(static_cast(ixgmq_.tx_tail_ - ixgmq_.tx_head_))); + // not enough descriptors got freed + if (free_desc < (count + 1)) { + return; + } + } +#endif + + if (pinfo.flags & PacketInfo::kNeedsIpCsum) { + ip_cksum = true; + } + + // NEED CHECKSUM + if (pinfo.flags & PacketInfo::kNeedsCsum) { + tcpudp_cksum = true; + + // check datasheet for numbers + if (pinfo.csum_offset == 6) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); + } else if (pinfo.csum_offset == 16) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); + } else { + ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); + } + + // if buffer is chained + if (buf->IsChained()) { + size_t counter = 0; + for (auto& buf_it : *buf) { + counter++; + + // first buffer + if (counter == 1) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), len, + true, false, 0, ip_cksum, tcpudp_cksum); + } else { + // last buffer + if (counter == count) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, true, 0, ip_cksum, tcpudp_cksum); + } else { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, false, 0, ip_cksum, tcpudp_cksum); + } + } + } + } + // not chained + else { + AddTx(buf->Data(), len, len, true, true, 0, ip_cksum, tcpudp_cksum); + } + } else { + // NO CHECKSUM FLAG SET + // if buffer is chained + if (buf->IsChained()) { + size_t counter = 0; + for 
(auto& buf_it : *buf) { + counter++; + + // first buffer + if (counter == 1) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), len, + true, false, 0, ip_cksum, tcpudp_cksum); + } else { + // last buffer + if (counter == count) { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, true, 0, ip_cksum, tcpudp_cksum); + } else { + AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), + len, false, false, 0, ip_cksum, tcpudp_cksum); + } + } + } + } + // not chained + else { + AddTx(buf->Data(), len, len, true, true, 0, ip_cksum, tcpudp_cksum); + } + } + + // bump tx_tail + // indicates position beyond last descriptor hw + WriteTdt_1(Cpu::GetMine(), ixgmq_.tx_tail_); +} + +void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { + // Disable RXCTRL - 8.2.3.8.10 + bar0_.Write32(0x03000, m); +} + +void ebbrt::IxgbeDriver::WriteDmatxctl(uint32_t m) { + uint32_t reg; + + reg = bar0_.Read32(0x04A80); + ebbrt::kprintf("0x04A80: DMATXCTL 0x%08X - reset to 0x%08X\n", reg, reg & m); + + // DMATXCTL - 8.2.3.9.2 + bar0_.Write32(0x04A80, reg & m); +} +void ebbrt::IxgbeDriver::WriteDmatxctl_te(uint32_t m) { + auto reg = bar0_.Read32(0x04A80); + bar0_.Write32(0x04A80, reg | m); +} + +// 8.2.3.5.18 - General Purpose Interrupt Enable — GPIE (0x00898; RW) +void ebbrt::IxgbeDriver::WriteGpie(uint32_t m) { + auto reg = bar0_.Read32(0x00898); + bar0_.Write32(0x00898, reg | m); +} + +// 8.2.3.5.1 Extended Interrupt Cause Register- EICR (0x00800; RW1C) +void ebbrt::IxgbeDriver::ReadEicr() { + /* Note + * The EICR is also cleared on read if GPIE.OCD bit is cleared. When the + * GPIE.OCD bit is set, then only bits 16...29 are cleared on read. 
+ */ + // 8.2.3.5.18 General Purpose Interrupt Enable — GPIE (0x00898;RW) + uint32_t reg; + reg = bar0_.Read32(0x00898); + ebbrt::kbugon((reg & 0x20), "GPIE.OCD not cleared\n"); + + reg = bar0_.Read32(0x00800); + ebbrt::kprintf("First Read - 0x00800: EICR 0x%08X, ", reg); + + reg = bar0_.Read32(0x00800); + ebbrt::kprintf("Second Read - EICR 0x%08X\n", reg); +} +void ebbrt::IxgbeDriver::WriteEicr(uint32_t m) { + auto reg = bar0_.Read32(0x00800); + bar0_.Write32(0x00800, reg | m); +} + +// 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) +uint32_t ebbrt::IxgbeDriver::ReadEims() { return bar0_.Read32(0x00880); } +void ebbrt::IxgbeDriver::WriteEims(uint32_t m) { bar0_.Write32(0x00880, m); } + +// 8.2.3.5.4 Extended Interrupt Mask Clear Register- EIMC (0x00888; WO) +void ebbrt::IxgbeDriver::WriteEimc(uint32_t m) { bar0_.Write32(0x00888, m); } + +// 8.2.3.5.5 Extended Interrupt Auto Clear Register — EIAC (0x00810; RW) +void ebbrt::IxgbeDriver::WriteEiac(uint32_t m) { + auto reg = bar0_.Read32(0x00810); + bar0_.Write32(0x00810, reg | m); +} + +// 8.2.3.5.8 Extended Interrupt Mask Set/Read Registers — EIMS[n] (0x00AA0 + +// 4*(n-1), n=1...2; RWS) +void ebbrt::IxgbeDriver::WriteEimsn(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00AA0 + 4 * n); + bar0_.Write32(0x00AA0 + 4 * n, reg | m); +} + +// 8.2.3.5.12 +// Extended Interrupt Throttle Registers — EITR[n] +// (0x00820 + 4*n, n=0...23 and 0x012300 + 4*(n-24), +// n=24...128; RW) +void ebbrt::IxgbeDriver::WriteEitr(uint32_t n, uint32_t m) { + ebbrt::kbugon(n > 128, "%s error\n", __FUNCTION__); + + if (n < 24) { + bar0_.Write32(0x00820 + 4 * n, m); + } else { + bar0_.Write32(0x012300 + 4 * (n - 24), m); + } +} + +// 8.2.3.9.10 Transmit Descriptor Control — TXDCTL[n] (0x06028+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTxdctl(uint32_t n, uint32_t m) { + bar0_.Write32(0x06028 + (0x40 * n), m); +} +uint8_t ebbrt::IxgbeDriver::ReadTxdctl_enable(uint32_t n) { + auto reg = 
bar0_.Read32(0x06028 + 0x40 * n); + return (reg >> 25) & 0x1; +} + +// 8.2.3.8.6 Receive Descriptor Control — RXDCTL[n] (0x01028 + +// 0x40*n, n=0...63 and 0x0D028 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRxdctl_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01028 + (0x40 * n), m); +} +void ebbrt::IxgbeDriver::WriteRxdctl_1_enable(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01028 + (0x40 * n)); + bar0_.Write32(0x01028 + (0x40 * n), reg | m); +} + +uint8_t ebbrt::IxgbeDriver::ReadRxdctl_1_enable(uint32_t n) { + auto reg = bar0_.Read32(0x01028 + (0x40 * n)); + return (reg >> 25) & 0x1; +} + +void ebbrt::IxgbeDriver::WriteRxdctl_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D028 + (0x40 * n), m); +} + +// 8.2.3.27.14 PF VM L2 Control Register — PFVML2FLT[n] (0x0F000 + 4*n, +// n=0...63; RW) +void ebbrt::IxgbeDriver::WritePfvml2flt(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F000 + 4 * n, m); +} + +// 8.2.3.9.14 Manageability Transmit TC Mapping — MNGTXMAP (0x0CD10; RW) +void ebbrt::IxgbeDriver::WriteMngtxmap(uint32_t m) { + bar0_.Write32(0x0CD10, m); +} + +// 8.2.3.1.1 Device Control Register — CTRL (0x00000 / 0x00004;RW) +void ebbrt::IxgbeDriver::WriteCtrl(uint32_t m) { bar0_.Write32(0x0, m); } +void ebbrt::IxgbeDriver::ReadCtrl() { + uint32_t reg; + reg = bar0_.Read32(0x0); + ebbrt::kprintf("%s = 0x%X\n", __FUNCTION__, reg); +} + +// 8.2.3.1.3 Extended Device Control Register — CTRL_EXT (0x00018; RW) +void ebbrt::IxgbeDriver::WriteCtrlExt(uint32_t m) { + auto reg = bar0_.Read32(0x00018); + bar0_.Write32(0x00018, reg | m); +} + +// 8.2.3.7.1 Filter Control Register — FCTRL (0x05080; RW) +void ebbrt::IxgbeDriver::WriteFctrl(uint32_t m) { bar0_.Write32(0x05080, m); } + +// 8.2.3.24.9 Flexible Host Filter Table Registers — FHFT (0x09000 — 0x093FC and +// 0x09800 — 0x099FC; RW) +void ebbrt::IxgbeDriver::WriteFhft_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x09000, m); +} +void ebbrt::IxgbeDriver::WriteFhft_2(uint32_t n, uint32_t m) { + 
bar0_.Write32(0x09800, m); +} + +// 8.2.3.1.2 Device Status Register — STATUS (0x00008; RO) +bool ebbrt::IxgbeDriver::ReadStatusPcieMes() { + auto reg = bar0_.Read32(0x8); + return !(reg & 0x80000); +} +uint8_t ebbrt::IxgbeDriver::ReadStatusLanId() { + auto reg = bar0_.Read32(0x8); + return (reg >> 2) & 0x3; +} + +// 8.2.3.3.2 Flow Control Transmit Timer Value n — FCTTVn (0x03200 + 4*n, +// n=0...3; RW) +void ebbrt::IxgbeDriver::WriteFcttv(uint32_t n, uint32_t m) { + bar0_.Write32(0x03200 + (4 * n), m); +} + +// 8.2.3.3.3 Flow Control Receive Threshold Low — FCRTL[n] (0x03220 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteFcrtl(uint32_t n, uint32_t m) { + bar0_.Write32(0x03220 + (4 * n), m); +} + +// 8.2.3.3.4 Flow Control Receive Threshold High — FCRTH[n] (0x03260 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteFcrth(uint32_t n, uint32_t m) { + bar0_.Write32(0x03260 + (4 * n), m); +} + +// 8.2.3.3.5 Flow Control Refresh Threshold Value — FCRTV (0x032A0; RW) +void ebbrt::IxgbeDriver::WriteFcrtv(uint32_t m) { bar0_.Write32(0x032A0, m); } + +// 8.2.3.3.7 Flow Control Configuration — FCCFG (0x03D00; RW) +void ebbrt::IxgbeDriver::WriteFccfg(uint32_t m) { bar0_.Write32(0x03D00, m); } + +// 8.2.3.2.2 EEPROM Read Register — EERD (0x10014; RW) +void ebbrt::IxgbeDriver::WriteEerd(uint32_t m) { bar0_.Write32(0x10014, m); } +bool ebbrt::IxgbeDriver::ReadEerdDone() { + auto reg = bar0_.Read32(0x10014); + return !!(reg & 0x2); // return true when Read Done = 1 +} + +uint16_t ebbrt::IxgbeDriver::ReadEerdData() { + auto reg = bar0_.Read32(0x10014); + return (reg >> 16) & 0xFFFF; +} + +uint16_t ebbrt::IxgbeDriver::ReadEeprom(uint16_t offset) { + WriteEerd(offset << 2 | 1); + // TODO: Timeout + while (ReadEerdDone() == 0) + ; + return ReadEerdData(); +} + +// 8.2.3.22.32 - Core Analog Configuration Register — CoreCTL (0x014F00; RW) +void ebbrt::IxgbeDriver::WriteCorectl(uint16_t m) { + bar0_.Write32(0x014F00, 0x0 | m); +} + +// 8.2.3.22.19 Auto Negotiation Control 
Register — AUTOC (0x042A0; RW) +void ebbrt::IxgbeDriver::WriteAutoc(uint32_t m) { + auto reg = bar0_.Read32(0x042A0); + bar0_.Write32(0x042A0, reg | m); +} +uint8_t ebbrt::IxgbeDriver::ReadAutocRestartAn() { + auto reg = bar0_.Read32(0x042A0); + return (reg >> 12) & 0x1; +} + +// 8.2.3.22.23 Auto Negotiation Link Partner Link Control Word 1 Register — +// ANLP1 (0x042B0; RO) +uint8_t ebbrt::IxgbeDriver::ReadAnlp1() { + auto reg = bar0_.Read32(0x042B0); + return (reg >> 16) & 0xFF; +} + +// 8.2.3.2.1 EEPROM/Flash Control Register — EEC (0x10010; RW) +uint8_t ebbrt::IxgbeDriver::ReadEecAutoRd() { + auto reg = bar0_.Read32(0x10010); + return (reg >> 9) & 0xFF; +} + +// 8.2.3.7.7 Multicast Table Array — MTA[n] (0x05200 + 4*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteMta(uint32_t n, uint32_t m) { + bar0_.Write32(0x05200 + (4 * n), m); +} + +// 8.2.3.7.11 VLAN Filter Table Array — VFTA[n] (0x0A000 + 4*n,n=0...127; RW) +void ebbrt::IxgbeDriver::WriteVfta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A000 + (4 * n), m); +} + +// 8.2.3.27.15 PF VM VLAN Pool Filter — PFVLVF[n] (0x0F100 + 4*n, n=0...63; RW) +void ebbrt::IxgbeDriver::WritePfvlvf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F100 + 4 * n, m); +} + +// 8.2.3.27.16 PF VM VLAN Pool Filter Bitmap — PFVLVFB[n] (0x0F200 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WritePfvlvfb(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F200 + 4 * n, m); +} + +// 8.2.3.7.23 Rx Filter ECC Err Insertion 0 — RXFECCERR0 (0x051B8; RW) +void ebbrt::IxgbeDriver::WriteRxfeccerr0(uint32_t m) { + auto reg = bar0_.Read32(0x051B8); + bar0_.Write32(0x051B8, reg | m); +} + +// Checks the MAC's EEPROM to see if it supports a given SFP+ module type, if +// 1360 +// so it returns the offsets to the phy init sequence block. 
+// also based on +// http://lxr.free-electrons.com/source/drivers/net/ethernet/intel/ixgbe/ixgbe_phy.c?v=3.14#L1395 +// https://github.com/freebsd/freebsd/blob/386ddae58459341ec567604707805814a2128a57/sys/dev/ixgbe/ixgbe_82599.c#L173 +void ebbrt::IxgbeDriver::PhyInit() { + + uint16_t list_offset; + uint16_t data_offset = 0x0; + uint16_t data_value; + uint16_t sfp_id; + uint16_t sfp_type = 0x4; /* SPF_DA_CORE1 */ + + /* IXGBE_PHY_INIT_OFFSET_NL */ + list_offset = ReadEeprom(0x002B); + + if ((list_offset == 0x0) || (list_offset == 0xFFFF)) { + return; + } + + /* Shift offset to first ID word */ + list_offset++; + + sfp_id = ReadEeprom(list_offset); + + while (sfp_id != 0xFFFF) { + if (sfp_id == sfp_type) { + list_offset++; + data_offset = ReadEeprom(list_offset); + if ((data_offset == 0x0) || (data_offset == 0xFFFF)) { + ebbrt::kprintf("sfp init failed\n"); + return; + } else { + break; + } + } else { + list_offset += 2; + sfp_id = ReadEeprom(list_offset); + } + list_offset++; + } + + if (sfp_id == 0xFFFF) { + ebbrt::kprintf("sfp init failed\n"); + return; + } + + ebbrt::kprintf("data offset -> 0x%x\n", data_offset); + + SwfwLockPhy(); + + data_value = ReadEeprom(++data_offset); + while (data_value != 0xFFFF) { + ebbrt::kprintf("data_value -> 0x%x\n", data_value); + WriteCorectl(data_value); + data_value = ReadEeprom(++data_offset); + } + SwfwUnlockPhy(); + + ebbrt::clock::SleepMilli(20); + + WriteAutoc(0x0 << 13 | 0x1 << 12); + while (ReadAnlp1() != 0) + ; // TODO: timeout + + WriteAutoc(0x3 << 13 | 0x1 << 12); + while (ReadAutocRestartAn() != 0) + ; // TODO: timeout + + ebbrt::kprintf("PHY init done\n"); +} + +// 8.2.3.7.8 Receive Address Low — RAL[n] (0x0A200 + 8*n, n=0...127; RW) +uint32_t ebbrt::IxgbeDriver::ReadRal(uint32_t n) { + auto reg = bar0_.Read32(0x0A200 + 8 * n); + return reg; +} +void ebbrt::IxgbeDriver::WriteRal(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A200 + (8 * n), m); +} + +// 8.2.3.7.9 Receive Address High — RAH[n] (0x0A204 + 8*n, 
n=0...127; RW) +uint16_t ebbrt::IxgbeDriver::ReadRah(uint32_t n) { + auto reg = bar0_.Read32(0x0A204 + 8 * n); + return (reg)&0xFFFF; +} +uint8_t ebbrt::IxgbeDriver::ReadRahAv(uint32_t n) { + return (bar0_.Read32(0x0A204 + 8 * n) >> 31) & 0xFF; +} +void ebbrt::IxgbeDriver::WriteRah(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A204 + (8 * n), m); +} + +// 8.2.3.7.10 MAC Pool Select Array — MPSAR[n] (0x0A600 + 4*n, n=0...255; RW) +void ebbrt::IxgbeDriver::WriteMpsar(uint32_t n, uint32_t m) { + bar0_.Write32(0x0A600 + 4 * n, m); +} + +// 8.2.3.7.19 Five tuple Queue Filter — FTQF[n] (0x0E600 + 4*n,n=0...127; RW) +void ebbrt::IxgbeDriver::WriteFtqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E600 + 4 * n, m); +} + +// 8.2.3.7.16 Source Address Queue Filter — SAQF[n] (0x0E000 + 4*n, n=0...127; +// RW) +void ebbrt::IxgbeDriver::WriteSaqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E000 + 4 * n, m); +} + +// 8.2.3.7.17 Destination Address Queue Filter — DAQF[n] (0x0E200 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDaqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E200 + 4 * n, m); +} + +// 8.2.3.7.18 Source Destination Port Queue Filter — SDPQF[n] (0x0E400 + 4*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteSdpqf(uint32_t n, uint32_t m) { + bar0_.Write32(0x0E400 + 4 * n, m); +} + +// 8.2.3.27.17 PF Unicast Table Array — PFUTA[n] (0x0F400 + 4*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WritePfuta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0F400 + 4 * n, m); +} + +// 8.2.3.7.3 Multicast Control Register — MCSTCTRL (0x05090; RW) +void ebbrt::IxgbeDriver::WriteMcstctrl(uint32_t m) { + auto reg = bar0_.Read32(0x05090); + bar0_.Write32(0x05090, reg & m); +} + +// 8.2.3.10.13 DCB Transmit Descriptor Plane Queue Select — RTTDQSEL (0x04904; +// RW) +void ebbrt::IxgbeDriver::WriteRttdqsel(uint32_t m) { + auto reg = bar0_.Read32(0x04904); + bar0_.Write32(0x04904, reg | m); +} + +// 8.2.3.10.14 DCB Transmit Descriptor Plane T1 Config — RTTDT1C (0x04908; RW) +void 
ebbrt::IxgbeDriver::WriteRttdt1c(uint32_t m) { bar0_.Write32(0x04908, m); } + +// 8.2.3.10.16 DCB Transmit Rate-Scheduler Config — RTTBCNRC (0x04984; RW) +void ebbrt::IxgbeDriver::WriteRttbcnrc(uint32_t m) { + bar0_.Write32(0x04984, m); +} + +// 8.2.3.10.9 DCB Transmit Descriptor Plane T2 Config - RTTDT2C[n] (0x04910 + +// 4*n, n=0...7; RW) DMA-Tx +void ebbrt::IxgbeDriver::WriteRttdt2c(uint32_t n, uint32_t m) { + bar0_.Write32(0x04910 + 4 * n, m); +} + +// 8.2.3.10.10 DCB Transmit Packet Plane T2 Config — RTTPT2C[n] (0x0CD20 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteRttpt2c(uint32_t n, uint32_t m) { + bar0_.Write32(0x0CD20 + 4 * n, m); +} + +// 8.2.3.10.6 DCB Receive Packet Plane T4 Config — RTRPT4C[n] (0x02140 + 4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteRtrpt4c(uint32_t n, uint32_t m) { + bar0_.Write32(0x02140 + 4 * n, m); +} + +// 8.2.3.10.1 DCB Receive Packet Plane Control and Status — RTRPCS (0x02430; RW) +void ebbrt::IxgbeDriver::WriteRtrpcs(uint32_t m) { bar0_.Write32(0x02430, m); } + +// 8.2.3.11.2 Tx DCA Control Registers — DCA_TXCTRL[n] (0x0600C + 0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteDcaTxctrlTxdescWbro(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + bar0_.Write32(0x0600C + 0x40 * n, reg & m); +} + +// 8.2.3.11.1 Rx DCA Control Register — DCA_RXCTRL[n] (0x0100C + 0x40*n, +// n=0...63 and 0x0D00C + 0x40*(n-64), +// n=64...127 / 0x02200 + 4*n, [n=0...15]; RW) +void ebbrt::IxgbeDriver::WriteDcaRxctrl_1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg & m); +} + +// void ebbrt::IxgbeDriver::WriteDcaRxctrl_1_RxdataWrro(uint32_t n, uint32_t m); +void ebbrt::IxgbeDriver::WriteDcaRxctrl_2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0D00C + 0x40 * n); + bar0_.Write32(0x0D00C + 0x40 * n, reg & m); +} + +// 8.2.3.7.5 Receive Checksum Control — RXCSUM (0x05000; RW) +void ebbrt::IxgbeDriver::WriteRxcsum(uint32_t m) { + auto 
reg = bar0_.Read32(0x05000); + bar0_.Write32(0x05000, reg | m); +} + +// 8.2.3.8.13 RSC Control — RSCCTL[n] (0x0102C + 0x40*n, n=0...63 +// and 0x0D02C + 0x40*(n-64), n=64...127; RW) +// ORs m into RSCCTL[n]; only the first bank (0x0102C + 0x40*n) is addressed. +void ebbrt::IxgbeDriver::WriteRscctl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0102C + 0x40 * n); + bar0_.Write32(0x0102C + 0x40 * n, reg | m); +} + +// 8.2.3.7.4 Packet Split Receive Type Register — PSRTYPE[n] +// (0x0EA00 + 4*n, n=0...63 / 0x05480 + 4*n, n=0...15; RW) +// PSRTYPE is a packed array with a 4-byte stride. The per-queue 0x40 stride +// used by the ring registers does not apply here: 0x0EA00 + 0x40*n reaches +// 0x0EB00 (RETA[0]) at n = 4 and would overwrite the redirection table. +// WritePsrtype ORs m into PSRTYPE[n]; WritePsrtypeZero stores 0x0. +void ebbrt::IxgbeDriver::WritePsrtype(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0EA00 + 4 * n); + bar0_.Write32(0x0EA00 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WritePsrtypeZero(uint32_t n) { + bar0_.Write32(0x0EA00 + 4 * n, 0x0); +} + +// 8.2.3.7.15 Redirection Table — RETA[n] (0x0EB00 + 4*n, n=0...31/ 0x05C00 + +// 4*n, n=0...31; RW) +void ebbrt::IxgbeDriver::WriteReta(uint32_t n, uint32_t m) { + bar0_.Write32(0x0EB00 + 4 * n, m); +} + +// 8.2.3.7.6 Receive Filter Control Register — RFCTL (0x05008; RW) +void ebbrt::IxgbeDriver::WriteRfctl(uint32_t m) { bar0_.Write32(0x05008, m); } + +// 8.2.3.9.16 Tx Packet Buffer Threshold — +// TXPBTHRESH (0x04950 +0x4*n, n=0...7; RW) +void ebbrt::IxgbeDriver::WriteTxpbthresh(uint32_t n, uint32_t m) { + bar0_.Write32(0x04950 + 0x4 * n, m); +} + +// 8.2.3.7.12 Multiple Receive Queues Command Register- MRQC (0x0EC80 / 0x05818; +// RW) +// ORs m into the current MRQC value. +void ebbrt::IxgbeDriver::WriteMrqc(uint32_t m) { + auto reg = bar0_.Read32(0x0EC80); + bar0_.Write32(0x0EC80, reg | m); +} + +// 8.2.3.9.15 Multiple Transmit Queues Command Register — MTQC (0x08120; RW) +void ebbrt::IxgbeDriver::WriteMtqc(uint32_t m) { bar0_.Write32(0x08120, m); } + +// 8.2.3.27.1 VT Control Register — PFVTCTL (0x051B0; RW) +void ebbrt::IxgbeDriver::WritePfvtctl(uint32_t m) { bar0_.Write32(0x051B0, m); } + +// 8.2.3.10.4 DCB Receive User Priority to Traffic Class — RTRUP2TC (0x03020; +// RW) +void ebbrt::IxgbeDriver::WriteRtrup2tc(uint32_t m) { + bar0_.Write32(0x03020, m); +} + +// 8.2.3.10.5 
DCB Transmit User Priority to Traffic Class — RTTUP2TC (0x0C800; +// RW) +void ebbrt::IxgbeDriver::WriteRttup2tc(uint32_t m) { + bar0_.Write32(0x0C800, m); +} + +// 8.2.3.9.1 DMA Tx TCP Max Allow Size Requests — DTXMXSZRQ (0x08100; RW) +// ORs m into the current value. +void ebbrt::IxgbeDriver::WriteDtxmxszrq(uint32_t m) { + auto reg = bar0_.Read32(0x08100); + bar0_.Write32(0x08100, reg | m); +} + +// 8.2.3.27.9 PF PF Queue Drop Enable Register — PFQDE (0x02F04; RW) +void ebbrt::IxgbeDriver::WritePfqde(uint32_t m) { bar0_.Write32(0x02F04, m); } + +// 8.2.3.22.34 MAC Flow Control Register — MFLCN (0x04294; RW) +// ORs m into the current value, so this helper can set bits but never clear +// them. +void ebbrt::IxgbeDriver::WriteMflcn(uint32_t m) { + auto reg = bar0_.Read32(0x04294); + bar0_.Write32(0x04294, reg | m); +} + +// 8.2.3.3.7 Flow Control Configuration — FCCFG (0x03D00; RW) +/*void ebbrt::IxgbeDriver::WriteFccfg(uint32_t m) { + auto reg = bar0_.Read32(0x03D00); + bar0_.Write32(0x03D00, reg | m); + }*/ + +// void ebbrt::IxgbeDriver::WriteDcaRxctrl_2_RxdataWrro(uint32_t n, uint32_t m); + +// 8.2.3.4.9 - Software Semaphore Register — SWSM (0x10140; RW) +// SwsmSmbiRead returns true when bit 0 (mask 0x1) is set. +// SwsmSwesmbiRead is inverted: it returns true while bit 1 (mask 0x2) is still +// clear, which is why SwfwSemAcquire spins while it returns true. +// The Set/Clear helpers below read the register, log the before/after values +// and write the modified value back. +bool ebbrt::IxgbeDriver::SwsmSmbiRead() { + return !!(bar0_.Read32(0x10140) & 0x1); +} +bool ebbrt::IxgbeDriver::SwsmSwesmbiRead() { + return !(bar0_.Read32(0x10140) & 0x2); +} +void ebbrt::IxgbeDriver::SwsmSwesmbiSet() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg | 0x2); + bar0_.Write32(0x10140, reg | 0x2); +} +void ebbrt::IxgbeDriver::SwsmSmbiClear() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg & 0xFFFFFFFE); + bar0_.Write32(0x10140, reg & 0xFFFFFFFE); +} +void ebbrt::IxgbeDriver::SwsmSwesmbiClear() { + auto reg = bar0_.Read32(0x10140); + ebbrt::kprintf("%s: reg before: 0x%08X, reg after: 0x%08X\n", __FUNCTION__, + reg, reg & 0xFFFFFFFD); + bar0_.Write32(0x10140, reg & 0xFFFFFFFD); +} + +// 8.2.3.22.20 Link Status Register — LINKS (0x042A4; RO) +// Returns true when bit 30 of LINKS is set. +bool 
ebbrt::IxgbeDriver::ReadLinksLinkUp() { + auto reg = bar0_.Read32(0x042A4); + return ((reg >> 30) & 0x1) == 1; +} + +// 8.2.3.4.11 Software-Firmware Synchronization - SW_FW_SYNC (0x10160; RW) +// ReadSwfwSyncSmBits returns (register & m) restricted to bits 9:0. +// WriteSwfwSyncSmBits ORs m in (sets bits); WriteSwfwSyncSmBits2 ANDs m in +// (SwfwUnlockPhy passes the complement of the bit it releases). +uint32_t ebbrt::IxgbeDriver::ReadSwfwSyncSmBits(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + return (reg & m) & 0x3FF; // masking bits 9:0 +} +void ebbrt::IxgbeDriver::WriteSwfwSyncSmBits(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + bar0_.Write32(0x10160, reg | m); +} +void ebbrt::IxgbeDriver::WriteSwfwSyncSmBits2(uint32_t m) { + auto reg = bar0_.Read32(0x10160); + bar0_.Write32(0x10160, reg & m); +} + +// 8.2.3.11.1 Rx DCA Control Register — DCA_RXCTRL[n] (0x0100C + 0x40*n, +// n=0...63 and 0x0D00C + 0x40*(n-64), +// n=64...127 / 0x02200 + 4*n, [n=0...15]; RW) +// WriteDcaRxctrl ORs m into the 0x0100C bank; WriteDcaRxctrlClear ANDs m into +// the same register (pass the complement of the bits to clear). +void ebbrt::IxgbeDriver::WriteDcaRxctrl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteDcaRxctrlClear(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + bar0_.Write32(0x0100C + 0x40 * n, reg & m); +} + +// 8.2.3.11.4 DCA Control Register — DCA_CTRL (0x11074; RW) +// ORs m into the current value. +void ebbrt::IxgbeDriver::WriteDcaCtrl(uint32_t m) { + auto reg = bar0_.Read32(0x11074); + bar0_.Write32(0x11074, reg | m); +} + +// 8.2.3.11.2 Tx DCA Control Registers — DCA_TXCTRL[n] (0x0600C + 0x40*n, +// n=0...127; RW) +// ORs m into DCA_TXCTRL[n]. +void ebbrt::IxgbeDriver::WriteDcaTxctrl(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + bar0_.Write32(0x0600C + 0x40 * n, reg | m); +} + +// 8.2.3.8.1 Receive Descriptor Base Address Low — RDBAL[n] (0x01000 + 0x40*n, +// n=0...63 and 0x0D000 + 0x40*(n-64), n=64...127; RW) +// The _1 variant addresses the 0x01000 bank, the _2 variant the 0x0D000 bank; +// in both, n is the index inside that bank. +void ebbrt::IxgbeDriver::WriteRdbal_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01000 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdbal_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D000 + 0x40 * n, m); +} 
+ +// 8.2.3.8.2 Receive Descriptor Base Address High — RDBAH[n] (0x01004 + 0x40*n, +// n=0...63 and 0x0D004 + 0x40*(n-64), n=64...127; RW) +// The _1 variant addresses the 0x01004 bank, the _2 variant the 0x0D004 bank. +void ebbrt::IxgbeDriver::WriteRdbah_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01004 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdbah_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D004 + 0x40 * n, m); +} + +// 8.2.3.9.5 Transmit Descriptor Base Address Low — TDBAL[n] (0x06000+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdbal(uint32_t n, uint32_t m) { + bar0_.Write32(0x06000 + 0x40 * n, m); +} + +// 8.2.3.9.6 Transmit Descriptor Base Address High — TDBAH[n] (0x06004+0x40*n, +// n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdbah(uint32_t n, uint32_t m) { + bar0_.Write32(0x06004 + 0x40 * n, m); +} + +// 8.2.3.9.7 Transmit Descriptor Length — TDLEN[n] (0x06008+0x40*n, n=0...127; +// RW) +void ebbrt::IxgbeDriver::WriteTdlen(uint32_t n, uint32_t m) { + bar0_.Write32(0x06008 + 0x40 * n, m); +} + +// 8.2.3.9.8 Transmit Descriptor Head — TDH[n] (0x06010+0x40*n, n=0...127; RO) +// NOTE(review): the heading above lists TDH as RO, yet WriteTdh stores to it — +// confirm that writing the head register is intended. +// ReadTdh returns the low 16 bits of TDH[n]. +void ebbrt::IxgbeDriver::WriteTdh(uint32_t n, uint32_t m) { + bar0_.Write32(0x06010 + 0x40 * n, m); +} +uint16_t ebbrt::IxgbeDriver::ReadTdh(uint32_t n) { + auto reg = bar0_.Read32(0x06010 + 0x40 * n); + return reg & 0xFFFF; +} + +// 8.2.3.9.11 Tx Descriptor Completion Write Back Address Low — +// TDWBAL[n] (0x06038+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdwbal(uint32_t n, uint32_t m) { + bar0_.Write32(0x06038 + 0x40 * n, m); +} +// 8.2.3.9.12 Tx Descriptor Completion Write Back Address High — +// TDWBAH[n] (0x0603C+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdwbah(uint32_t n, uint32_t m) { + bar0_.Write32(0x0603C + 0x40 * n, m); +} + +// 8.2.3.9.9 Transmit Descriptor Tail — TDT[n] (0x06018+0x40*n, n=0...127; RW) +void ebbrt::IxgbeDriver::WriteTdt(uint32_t n, uint32_t m) { + bar0_.Write32(0x06018 + 0x40 * n, m); +} + +// 8.2.3.8.3 Receive Descriptor Length — RDLEN[n] (0x01008 + 0x40*n, n=0...63 +// and 0x0D008 + 
0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdlen_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01008 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdlen_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D008 + 0x40 * n, m); +} + +// 8.2.3.8.7 Split Receive Control Registers — SRRCTL[n] (0x01014 + 0x40*n, +// n=0...63 and 0x0D014 + 0x40*(n-64), n=64...127 / 0x02100 + 4*n, [n=0...15]; +// RW) +// WriteSrrctl_1 ORs m into SRRCTL[n] of the 0x01014 bank, so it can only set +// bits; WriteSrrctlZero stores 0x0 to the same register. +void ebbrt::IxgbeDriver::WriteSrrctl_1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01014 + 0x40 * n); + bar0_.Write32(0x01014 + 0x40 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteSrrctlZero(uint32_t n) { + bar0_.Write32(0x01014 + 0x40 * n, 0x0); +} + +// 8.2.3.8.12 RSC Data Buffer Control Register — RSCDBU (0x03028; RW) +// ORs m into the current value. +void ebbrt::IxgbeDriver::WriteRscdbu(uint32_t m) { + auto reg = bar0_.Read32(0x03028); + bar0_.Write32(0x03028, reg | m); +} + +// ANDs m into SRRCTL[n] (0x01014 bank); pass the complement of the bits to +// clear. +void ebbrt::IxgbeDriver::WriteSrrctl_1_desctype(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x01014 + 0x40 * n); + bar0_.Write32(0x01014 + 0x40 * n, reg & m); +} + +// 8.2.3.8.8 Receive DMA Control Register — RDRXCTL (0x02F00; RW) +// WriteRdrxctl ORs m in; WriteRdrxctlRSCFRSTSIZE ANDs m in; +// ReadRdrxctlDmaidone returns bit 3 (0 or 1). +void ebbrt::IxgbeDriver::WriteRdrxctl(uint32_t m) { + auto reg = bar0_.Read32(0x02F00); + bar0_.Write32(0x02F00, reg | m); +} + +void ebbrt::IxgbeDriver::WriteRdrxctlRSCFRSTSIZE(uint32_t m) { + auto reg = bar0_.Read32(0x02F00); + bar0_.Write32(0x02F00, reg & m); +} + +uint8_t ebbrt::IxgbeDriver::ReadRdrxctlDmaidone() { + auto reg = bar0_.Read32(0x02F00); + return (reg >> 3) & 0x1; +} + +// 8.2.3.8.9 Receive Packet Buffer Size — RXPBSIZE[n] (0x03C00 + 4*n, n=0...7; +// RW) +void ebbrt::IxgbeDriver::WriteRxpbsize(uint32_t n, uint32_t m) { + bar0_.Write32(0x03C00 + 4 * n, m); +} + +// 8.2.3.9.13 Transmit Packet Buffer Size — TXPBSIZE[n] (0x0CC00 + 0x4*n, +// n=0...7; RW) +void ebbrt::IxgbeDriver::WriteTxpbsize(uint32_t n, uint32_t m) { + bar0_.Write32(0x0CC00 + 0x4 * n, m); +} + +// 8.2.3.9.16 Tx Packet Buffer Threshold — TXPBTHRESH (0x04950+0x4*n, n=0...7; +// RW) +void 
ebbrt::IxgbeDriver::WriteTxpbThresh(uint32_t n, uint32_t m) { + bar0_.Write32(0x04950 + 0x4 * n, m); +} + +// 8.2.3.22.8 MAC Core Control 0 Register — HLREG0 (0x04240; RW) +// ORs m into the current value. +void ebbrt::IxgbeDriver::WriteHlreg0(uint32_t m) { + auto reg = bar0_.Read32(0x04240); + bar0_.Write32(0x04240, reg | m); +} + +// 8.2.3.8.5 Receive Descriptor Tail — RDT[n] (0x01018 + 0x40*n, n=0...63 and +// 0x0D018 + 0x40*(n-64), n=64...127; RW) +void ebbrt::IxgbeDriver::WriteRdt_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01018 + 0x40 * n, m); +} +void ebbrt::IxgbeDriver::WriteRdt_2(uint32_t n, uint32_t m) { + bar0_.Write32(0x0D018 + 0x40 * n, m); +} + +// 8.2.3.8.4 Receive Descriptor Head — RDH[n] (0x01010 + 0x40*n, n=0...63 and +// 0x0D010 + 0x40*(n-64), n=64...127; RO) +// NOTE(review): the heading lists RDH as RO, yet both WriteRdh_1 variants store +// to it — confirm this is intended. The IxgbeDriverRep overload writes through +// root_.bar0_. +void ebbrt::IxgbeDriver::WriteRdh_1(uint32_t n, uint32_t m) { + bar0_.Write32(0x01010 + 0x40 * n, m); +} +void ebbrt::IxgbeDriverRep::WriteRdh_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x01010 + 0x40 * n, m); +} + +// Returns the low 16 bits of RDH[n] (0x01010 bank). +uint16_t ebbrt::IxgbeDriver::ReadRdh_1(uint32_t n) { + auto reg = bar0_.Read32(0x01010 + 0x40 * n); + return reg & 0xFFFF; +} + +// Returns the low 16 bits of RDT[n] (0x01018 bank). +uint16_t ebbrt::IxgbeDriver::ReadRdt_1(uint32_t n) { + auto reg = bar0_.Read32(0x01018 + 0x40 * n); + return reg & 0xFFFF; +} + +// Releases the SWSM semaphore: clears SWESMBI first, then SMBI, then logs. +void ebbrt::IxgbeDriver::SwfwSemRelease() { + SwsmSwesmbiClear(); + SwsmSmbiClear(); + ebbrt::kprintf("%s\n", __FUNCTION__); +} + +// 8.2.3.5.16 Interrupt Vector Allocation Registers — IVAR[n] (0x00900 + 4*n, +// n=0...63; RW) +// WriteIvarAlloc0..3 and WriteIvarAllocval0..3 all have the same body: read +// IVAR[n], OR m in, write it back. The caller supplies m already shifted into +// the field it wants to set. +void ebbrt::IxgbeDriver::WriteIvarAlloc0(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval0(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc1(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval1(uint32_t 
n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval2(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +void ebbrt::IxgbeDriver::WriteIvarAlloc3(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} +void ebbrt::IxgbeDriver::WriteIvarAllocval3(uint32_t n, uint32_t m) { + auto reg = bar0_.Read32(0x00900 + 4 * n); + bar0_.Write32(0x00900 + 4 * n, reg | m); +} + +// 8.2.3.10.2 DCB Transmit Descriptor Plane Control and Status — RTTDCS +// (0x04900; RW) DMA-Tx +// WriteRttdcs ORs m in; WriteRttdcsArbdisEn ANDs m in (Init passes ~(0x1 << 6) +// to clear bit 6 again). +void ebbrt::IxgbeDriver::WriteRttdcs(uint32_t m) { + auto reg = bar0_.Read32(0x04900); + bar0_.Write32(0x04900, reg | m); +} +void ebbrt::IxgbeDriver::WriteRttdcsArbdisEn(uint32_t m) { + auto reg = bar0_.Read32(0x04900); + bar0_.Write32(0x04900, reg & m); +} + +// 8.2.3.10.3 DCB Transmit Packet Plane Control and Status- RTTPCS (0x0CD00; RW) +void ebbrt::IxgbeDriver::WriteRttpcs(uint32_t m) { bar0_.Write32(0x0CD00, m); } + +// 8.2.3.12.5 Security Rx Control — SECRXCTRL (0x08D00; RW) +// A nonzero m is ORed into the register; m == 0 clears bit 1 instead. +void ebbrt::IxgbeDriver::WriteSecrxctrl_Rx_Dis(uint32_t m) { + auto reg = bar0_.Read32(0x08D00); + if (m) { + bar0_.Write32(0x08D00, reg | m); + } else { + bar0_.Write32(0x08D00, reg & ~(0x1 << 1)); + } +} + +// 8.2.3.12.6 Security Rx Status — SECRXSTAT (0x08D04; RO) +// Returns bit 0 of SECRXSTAT. +uint8_t ebbrt::IxgbeDriver::ReadSecrxstat_Sr_Rdy() { + auto reg = bar0_.Read32(0x08D04); + return reg & 0x1; +} + +// 8.2.3.23.59 Total Packets Received — TPR (0x040D0; RC) +// Reads the counter, logs it and returns it. +// NOTE(review): reg is an unsigned 32-bit value but is printed with %d. +uint32_t ebbrt::IxgbeDriver::ReadTpr() { + auto reg = bar0_.Read32(0x040D0); + ebbrt::kprintf("%s %d\n", __FUNCTION__, reg); + return reg; +} + +// 8.2.3.23.26 Good Packets Received Count — GPRC (0x04074; RO) +uint32_t 
ebbrt::IxgbeDriver::ReadGprc() { + auto reg = bar0_.Read32(0x04074); + ebbrt::kprintf("%s %d\n", __FUNCTION__, reg); + return reg; +} + +// Acquires the SWSM software semaphore: waits for SMBI to read 0, sets +// SWESMBI, then waits until SWESMBI reads back as 1. Neither wait has a +// timeout yet, so this currently always returns true. +bool ebbrt::IxgbeDriver::SwfwSemAcquire() { + // polls SWSM.SMBI until 0b is read or timeout + // TODO: timeout after 10 ms + while (SwsmSmbiRead()) + ; + + // writes 1b to SWSM.SWESMBI bit + SwsmSwesmbiSet(); + + // polls SWSM.SWESMBI bit until read as 1b + // TODO: timeout of 3 secs + while (SwsmSwesmbiRead()) + ; + + return true; +} + +// 10.5.4 Software and Firmware Synchronization +// Takes the SW_FW_SYNC PHY semaphore that belongs to this LAN function: +// SW_PHY_SM0 / FW_PHY_SM0 (0x2 / 0x40) when ReadStatusLanId() is 0, otherwise +// SW_PHY_SM1 / FW_PHY_SM1 (0x4 / 0x80). This matches SwfwUnlockPhy, which +// releases strictly by LAN id; LAN 0 must not fall back to the PHY1 bit when +// PHY0 is busy, because that bit would never be released by the unlock path. +// Retries every 20 ms until the semaphore is obtained, then returns true. +bool ebbrt::IxgbeDriver::SwfwLockPhy() { + bool good = false; + +again: + if (!SwfwSemAcquire()) { + ebbrt::kabort("SwfwSemAcquire failed\n"); + } else { + ebbrt::kprintf("SWSM Sem acquired\n"); + } + + if (ReadStatusLanId() == 0) { + if ((ReadSwfwSyncSmBits(0x2) == 0) // SW_PHY_SM0 + && (ReadSwfwSyncSmBits(0x40) == 0)) // FW_PHY_SM0 + { + WriteSwfwSyncSmBits(0x2); // SW_PHY_SM0 + ebbrt::kprintf("SW_PHY_SMO written\n"); + good = true; + } + } else if ((ReadSwfwSyncSmBits(0x4) == 0) // SW_PHY_SM1 + && (ReadSwfwSyncSmBits(0x80) == 0)) // FW_PHY_SM1 + { + WriteSwfwSyncSmBits(0x4); // SW_PHY_SM1 + ebbrt::kprintf("SW_PHY_SM1 written\n"); + good = true; + } + + // always drop the SWSM semaphore before sleeping or returning + SwfwSemRelease(); + + if (!good) { + ebbrt::kprintf("%s: failed, trying again\n", __FUNCTION__); + ebbrt::clock::SleepMilli(20); + goto again; + } + + return true; +} +// Releases the PHY semaphore bit for this LAN function (SW_PHY_SM0 for LAN 0, +// SW_PHY_SM1 otherwise) under the SWSM semaphore, then sleeps 10 ms. +void ebbrt::IxgbeDriver::SwfwUnlockPhy() { + if (!SwfwSemAcquire()) { + ebbrt::kabort("SwfwSemAcquire failed\n"); + } else { + ebbrt::kprintf("SWSM Sem acquired\n"); + } + + if (ReadStatusLanId() == 0) { + WriteSwfwSyncSmBits2(~0x2); // SW_PHY_SM0 + } else { + WriteSwfwSyncSmBits2(~0x4); // SW_PHY_SM1 + } + + SwfwSemRelease(); + + ebbrt::clock::SleepMilli(10); +} + +// Quiesces the device: disables rx/tx and interrupts, flushes every queue, +// then requests PCIe master disable and waits for it to take effect. +void ebbrt::IxgbeDriver::StopDevice() { + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + + // disable rx + WriteRxctrl(0x0); + + // disable tx + WriteDmatxctl(0xFFFFFFFE); + + // disable interrupts + WriteEimc(0x7FFFFFFF); + ReadEicr(); + + // disable each rx and tx queue + for 
(auto i = 0; i < 128; i++) { + // Bit 26, transmit software flush + WriteTxdctl(i, 0x04000000); + + // rx queues 0..63 live in the first register bank, 64..127 in the second + if (i < 64) { + WriteRxdctl_1(i, 0x0); + } else { + WriteRxdctl_2(i - 64, 0x0); + } + } + + // from arrakis + ebbrt::clock::SleepMilli(2); + + // Master disable procedure + WriteCtrl(0x4); // PCIe Master Disable + // NOTE: busy-waits without a timeout until the status helper reports 1 + while (ReadStatusPcieMes() != 1) + ; + ebbrt::kprintf("Ixgbe 82599 stop done\n"); +} + +// Issues a link reset followed by a device reset through CTRL, waits 2 ms and +// then reads CTRL back. +void ebbrt::IxgbeDriver::GlobalReset() { + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + + WriteCtrl(0x8); // Link Reset + WriteCtrl(0x4000000); // Device Reset + + // Note: To ensure that a global device reset has fully completed and that the + // 82599 responds to subsequent accesses, programmers must wait + // approximately 1 ms after setting the reset bit before attempting to check + // if the bit has cleared or to access (read or write) any other device + // register. + ebbrt::clock::SleepMilli(2); + ReadCtrl(); +} + +/** + * ixgbe_init_hw_generic - Generic hardware initialization + * @hw: pointer to hardware structure + * + * Initialize the hardware by resetting the hardware, filling the bus info + * structure and media type, clears all on chip counters, initializes receive + * address registers, multicast table, VLAN filter table, calls routine to set + * up link and flow control settings, and leaves transmit and receive units + * disabled and uninitialized + **/ +void ebbrt::IxgbeDriver::Init() { + uint64_t d_mac; + + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); + bar0_.Map(); // allocate virtual memory + ebbrt::clock::SleepMilli(200); + ebbrt::kprintf("Sleep 200 ms\n"); + + // stop the device, then reset it twice with settle delays in between + StopDevice(); + GlobalReset(); + ebbrt::clock::SleepMilli(50); + GlobalReset(); + ebbrt::clock::SleepMilli(250); + + // disable interrupts + WriteEimc(0x7FFFFFFF); + ReadEicr(); + + // Let firmware know we have taken over + WriteCtrlExt(0x1 << 28); // DRV_LOAD + + // No snoop disable from FreeBSD ?? 
+ WriteCtrlExt(0x1 << 16); // NS_DIS + + // Initialize flow-control registers + // (FCTTV is only written for indices 0..3, FCRTL/FCRTH for all 8) + for (auto i = 0; i < 8; i++) { + if (i < 4) { + WriteFcttv(i, 0x0); + } + WriteFcrtl(i, 0x0); + WriteFcrth(i, 0x0); + } + + WriteFcrtv(0x0); + WriteFccfg(0x0); + + // Initialize Phy + PhyInit(); + + // Wait for EEPROM auto read + while (ReadEecAutoRd() == 0) { + }; // TODO: Timeout + ebbrt::kprintf("EEPROM auto read done\n"); + + ebbrt::clock::SleepMilli(200); + // RAL[0] supplies the low 32 bits and the low 16 bits of RAH[0] supply bits + // 47:32; the six bytes are then unpacked least-significant first into + // mac_addr_ + d_mac = ReadRal(0) | ((uint64_t)ReadRah(0) << 32); + // ebbrt::kprintf("mac %p valid = %x\n", d_mac, ReadRahAv(0)); + for (auto i = 0; i < 6; i++) { + mac_addr_[i] = (d_mac >> (i * 8)) & 0xFF; + } + ebbrt::kprintf( + "Mac Address: %02X:%02X:%02X:%02X:%02X:%02X\n", + static_cast(mac_addr_[0]), static_cast(mac_addr_[1]), + static_cast(mac_addr_[2]), static_cast(mac_addr_[3]), + static_cast(mac_addr_[4]), static_cast(mac_addr_[5])); + + // Wait for DMA initialization + while (ReadRdrxctlDmaidone() == 0) { + }; // TODO: Timeout + + // Wait for link to come up + while (!ReadLinksLinkUp()) { + }; // TODO: timeout + ebbrt::kprintf("Link is up\n"); + ebbrt::clock::SleepMilli(50); + + // clears on read + // NOTE(review): this is a write of all ones, not a read — confirm the + // comment above matches the intended clearing mechanism. + WriteEicr(0xFFFFFFFF); + + /* setup msix */ + // switch to msix mode + WriteGpie(0x1 << 4); // Multiple_MSIX + WriteGpie(0x1 << 31); // PBA_support + WriteGpie(0x1 << 5); // OCD + + // TODO: Set up management interrupt handler + + // Enable auto masking of interrupt + WriteGpie(0x1 << 30); // EIAME + +#ifdef RSC_EN + // TODO: RSC delay value, just a guess at (1 + 1) * 4us = 8 us + // Recommended value based on 7.3.2.1.1 + WriteGpie(0x1 << 11); +#endif + + /* FreeBSD: + * ixgbe_common.c - s32 ixgbe_init_rx_addrs_generic(struct ixgbe_hw *hw) + * Places the MAC address in receive address register 0 and clears the rest + * of the receive address registers. Clears the multicast table. Assumes + * the receiver is in reset when the routine is called. + */ + // Initialize RX filters + + /* Zero out the other receive addresses. 
*/ + for (auto i = 1; i < 128; i++) { + WriteRal(i, 0x0); + WriteRah(i, 0x0); + } + + // clear mta + for (auto i = 0; i < 128; i++) { + WriteMta(i, 0x0); + } + + // No init uta tables? + + // set vlan filter table + for (auto i = 0; i < 128; i++) { + WriteVfta(i, 0x0); + } + + for (auto i = 0; i < 64; i++) { + // WritePfvlvf(i, 0x1 << 31); // VI_En bit 31 + WritePfvlvf(i, 0x0); + WritePfvlvfb(i, 0x0); + // WritePsrtypeZero(0x0); + } + + // PF Unicast Table Array + for (auto i = 0; i < 128; i++) { + WritePfuta(i, 0x0); + } + + // not sure why initing these tables? + for (auto i = 0; i < 128; i++) { + WriteFhft_1(i, 0x0); + if (i < 64) { + WriteFhft_2(i, 0x0); + } + } + + // enable ECC Reporting TODO - causes interrupts to be broken?? + // WriteRxfeccerr0(0x1 << 9); + + /**** Initialize RX filters ****/ + // FreeBSD if_ix.c - ixgbe_initialize_receive_units - Enable broadcast accept + WriteFctrl(0x1 << 10); // Set BAM = 1 + + // TODO VLNCTRL + // WriteMcstctrl ANDs its argument into MCSTCTRL, so 0x0 clears the register + WriteMcstctrl(0x0); + +#ifndef RSC_EN + WriteRxcsum(0x1 << 12); // IP payload checksum enable +#endif +// TODO RQTC + +#ifdef RSC_EN + WriteRfctl(0x0); +#else + WriteRfctl(0x1 << 5); +#endif + + // clear all 256 MAC pool select entries + for (auto i = 0; i < 256; i++) { + WriteMpsar(i, 0x0); + } + + // TODO RSSRK + + // clear the redirection table + for (auto i = 0; i < 32; i++) { + WriteReta(i, 0x0); + } + + // clear the 128 five-tuple filters and their address/port companions + for (auto i = 0; i < 128; i++) { + WriteFtqf(i, 0x0); + WriteSaqf(i, 0x0); + WriteDaqf(i, 0x0); + WriteSdpqf(i, 0x0); + } + + // TODO SYNQF + // TODO ETQF + // TODO ETQS + + // Make sure RX CRC strip enabled in HLREG0 and RDRXCTL + // (WriteRdrxctlRSCFRSTSIZE ANDs its mask in; WriteRdrxctl and WriteHlreg0 OR + // their argument in) + WriteRdrxctlRSCFRSTSIZE(~(0x1F << 17)); // s/w set to 0 + WriteRdrxctl(0x1 << 1); // CRCStrip + WriteHlreg0(0x1 << 1); // CRCStrip + WriteRdrxctl(0x1 << 25); // RSCACKC s/w set to 1 + WriteRdrxctl(0x1 << 26); // FCOE_WRFIX s/w set to 1 + // TODO RSCDBU + + /***** END RX FILTER *****/ + + // Configure buffers etc. according to specification + // Section 4.6.11.3.4 (no DCB, no virtualization) + + /* Transmit Init: Set RTTDCS.ARBDIS to 1b. 
+ * Program DTXMXSZRQ, TXPBSIZE, TXPBTHRESH, MTQC, and MNGTXMAP, according + * to the DCB and virtualization modes (see Section 4.6.11.3). + * Clear RTTDCS.ARBDIS to 0b. + */ + WriteRttdcs(0x1 << 6); + WriteDtxmxszrq(0xFFF); + WriteTxpbsize(0, 0xA0 << 10); + WriteTxpbThresh(0, 0xA0); + for (auto i = 1; i < 8; i++) { + WriteTxpbsize(i, 0x0); + WriteTxpbThresh(i, 0x0); + } + WriteMtqc(0x0); + WriteMngtxmap(0x0); + WriteRttdcsArbdisEn(~(0x1 << 6)); + + /* Receive Init: Program RXPBSIZE, MRQC, PFQDE, RTRUP2TC, MFLCN.RPFCE, + * and MFLCN.RFCE according to the DCB and virtualization modes + */ + WriteRxpbsize(0, 0x200 << 10); + for (auto i = 1; i < 8; i++) { + WriteRxpbsize(i, 0x0); + } + WriteMrqc(0x0); + WritePfqde(0x0); + WriteRtrup2tc(0x0); + WriteMflcn(0x0 << 2); + WriteMflcn(0x1 << 3); + // end DCB off, VT off + // NOTE(review): WriteMflcn and WriteMrqc OR their argument into the register, + // so the WriteMflcn(0x0 << 2) and WriteMrqc(0x0) calls above change nothing; + // if bit 2 (RPFCE) of MFLCN must be cleared, an AND-masking write is needed. + + // TODO Enable Jumbo Packets + + // disable relaxed ordering + // (all three helpers AND their mask in, so each call clears a single bit; + // rx queues 64..127 are addressed through the second register bank) + for (auto i = 0; i < 128; i++) { + WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); // Txdesc_Wbro + + if (i < 64) { + WriteDcaRxctrl_1( + i, ~(0x1 << 15)); // Rx split header relax order enable, bit 15 + WriteDcaRxctrl_1( + i, ~(0x1 << 13)); // Rx data Write Relax Order Enable, bit 13 + } else { + WriteDcaRxctrl_2( + i - 64, ~(0x1 << 15)); // Rx split header relax order enable, bit 15 + WriteDcaRxctrl_2( + i - 64, ~(0x1 << 13)); // Rx data Write Relax Order Enable, bit 13 + } + } + +#ifdef DCA_ENABLE + // DCA_MODE = DCA 1.0 + WriteDcaCtrl(0x1 << 1); +#endif +} + +// Final step once the queues are set up: sets NS_DIS again, clears bit 12 of +// DCA_RXCTRL for one queue per CPU, then writes 0xFFFF to EIMS. +void ebbrt::IxgbeDriver::FinishSetup() { + // No snoop disable from FreeBSD ?? 
+ WriteCtrlExt(0x1 << 16); // NS_DIS + for (size_t i = 0; i < Cpu::Count(); i++) { + WriteDcaRxctrlClear(i, ~(0x1 << 12)); // clear bit 12 + } + WriteEims(0xFFFF); +} + +// initializes per core rx/tx queues and interrupts +// i is the queue index (one queue per core); it must be below 64 because only +// the first rx register bank is programmed here. +void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { + if (!rcv_vector) { + rcv_vector = + event_manager->AllocateVector([this]() { ebb_->ReceivePoll(); }); + } + + // not going to set up receive descriptors greater than 63; check this before + // ixgmq[i] is touched so an out-of-range index never reaches the allocation + ebbrt::kbugon(i >= 64, "can't set up descriptors greater than 63\n"); + + // allocate memory for descriptor rings + ixgmq[i].reset(new e10Kq(i, Cpu::GetMyNode())); + + // update register RDBAL, RDBAH with receive descriptor base address + WriteRdbal_1(i, ixgmq[i]->rxaddr_ & 0xFFFFFFFF); + WriteRdbah_1(i, (ixgmq[i]->rxaddr_ >> 32) & 0xFFFFFFFF); + + // set to number of bytes allocated for receive descriptor ring + WriteRdlen_1(i, ixgmq[i]->rx_size_bytes_); + + // program srrctl register + // (start from zero; WriteSrrctl_1 ORs each field in) + WriteSrrctlZero(i); + WriteSrrctl_1(i, RXBUFSZ / 1024); // bsizepacket + WriteSrrctl_1(i, (128 / 64) << 8); // bsizeheader + +// TODO headsplit adv +#ifdef RSC_EN + WriteSrrctl_1(i, 0x1 << 25); // desctype adv +#else + // legacy is default?? + // The legacy descriptor type is selected by clearing bits 27:25. + // WriteSrrctl_1 ORs its argument in, so handing it ~(0x7 << 25) would set + // every other SRRCTL bit (buffer sizes, Drop_En, ...); clear the field with + // the AND-masking helper instead. + WriteSrrctl_1_desctype(i, ~(0x7 << 25)); // desctype legacy +#endif + + WriteSrrctl_1(i, 0x1 << 28); // Drop_En + +#ifdef RSC_EN + // RSC set up + WriteRscctl(i, 0x3 << 2); // MAXDESC + WriteRscctl(i, 0x1); // RSCEN + WritePsrtypeZero(i); + WritePsrtype(i, 0x1 << 4); // Split received TCP packets after TCP header. +#endif + + // Set head and tail pointers + WriteRdt_1(i, 0x0); + WriteRdh_1(i, 0x0); + + // Set Enable bit in receive queue + WriteRxdctl_1_enable(i, 0x1 << 25); + // TODO: Timeout + while (ReadRxdctl_1_enable(i) == 0) + ; + + // setup RX interrupts for queue i + dev_.SetMsixEntry(i, rcv_vector, ebbrt::Cpu::GetByIndex(i)->apic_id()); + + // don't set up interrupts for tx since we have head writeback?? 
+ auto qn = i / 2; // put into correct IVAR + + // each IVAR register serves two rx queues: the even queue is ORed in + // unshifted (valid flag 0x1 << 7), the odd queue shifted left by 16 (valid + // flag 0x1 << 23) + if ((i % 2) == 0) { // check if 2xN or 2xN + 1 + WriteIvarAlloc0(qn, i); // rx interrupt allocation corresponds to index i * + // 2 in MSI-X table + WriteIvarAllocval0(qn, 0x1 << 7); + } else { + WriteIvarAlloc2(qn, i << 16); + WriteIvarAllocval2(qn, 0x1 << 23); + } + + // must be greater than rsc delay + // WriteEitr(i, 0x80 << 3); // 128 * 2us = 256 us + WriteEitr(i, 0x7 << 3); // 7 * 2us = 14 us + + // 7.3.1.4 - Note that there are no EIAC(1)...EIAC(2) registers. + // The hardware setting for interrupts 16...63 is always auto clear. + if (i < 16) { + // enable auto clear + WriteEiac(0x1 << i); + } + + // enable interrupt + WriteEimsn(i / 32, (0x1 << (i % 32))); + + // make sure interrupt is cleared + if (i < 16) { + WriteEicr(0x1 << i); + } + + // Enable RX + // set SECRXCTRL bit 1 (RX_DIS) while RXCTRL is switched on + WriteSecrxctrl_Rx_Dis(0x1 << 1); + // TODO Timeout + while (ReadSecrxstat_Sr_Rdy() == 0) + ; + WriteRxctrl(0x1); + // clear SECRXCTRL bit 1 (RX_DIS) again; a zero argument makes the helper + // clear the bit + WriteSecrxctrl_Rx_Dis(0x0 << 1); + + // add buffer to each descriptor + // (NRXDESCS - 1 descriptors are filled, so the tail ends one short of the + // ring size) + for (size_t j = 0; j < NRXDESCS - 1; j++) { + auto rxphys = + reinterpret_cast((ixgmq[i]->circ_buffer_[j])->MutData()); + auto tail = ixgmq[i]->rx_tail_; + +// update buffer address for descriptor +#ifdef RSC_EN + rdesc_adv_rf_t* tmp; + tmp = reinterpret_cast(&(ixgmq[i]->rx_ring_[tail])); + + tmp->packet_buffer = rxphys; + // TODO only use this if enabling header splitting? 
+ tmp->header_buffer = 0; +#else + ixgmq[i]->rx_ring_[tail].buffer_address = rxphys; +#endif + + ixgmq[i]->rx_tail_ = (tail + 1) % ixgmq[i]->rx_size_; + } + + // bump tail ptr via register rdt to enable descriptor fetching by setting to + // length of ring minus one + WriteRdt_1(i, ixgmq[i]->rx_tail_); + +#ifdef DCA_ENABLE + // every WriteDcaRxctrl / WriteDcaTxctrl call ORs one field into the register + auto myapic = ebbrt::Cpu::GetByIndex(i)->apic_id(); + + WriteDcaRxctrl(i, 0x1 << 5); // Descriptor DCA EN + WriteDcaRxctrl(i, 0x1 << 6); // Rx Header DCA EN + WriteDcaRxctrl(i, 0x1 << 7); // Payload DCA EN + + WriteDcaRxctrl(i, myapic << 24); // CPUID = apic id + + WriteDcaTxctrl(i, 0x1 << 5); // DCA Enable + WriteDcaTxctrl(i, myapic << 24); // CPUID = apic id +#endif + + // program base address registers + // (the 64-bit tx ring address is split into its low and high 32-bit halves) + WriteTdbal(i, ixgmq[i]->txaddr_ & 0xFFFFFFFF); + WriteTdbah(i, (ixgmq[i]->txaddr_ >> 32) & 0xFFFFFFFF); + + // length must also be 128 byte aligned + WriteTdlen(i, ixgmq[i]->tx_size_bytes_); + +#ifdef TX_HEAD_WB + // bit 0 of the low write-back address word is ORed in together with the + // address + WriteTdwbal(i, (ixgmq[i]->txhwbaddr_ & 0xFFFFFFFF) | 0x1); + WriteTdwbah(i, (ixgmq[i]->txhwbaddr_ >> 32) & 0xFFFFFFFF); +#endif + + // enable transmit path + WriteDmatxctl_te(0x1); + + // transmit queue enable + WriteTxdctl(i, 0x1 << 25); + + // poll until set, TODO: Timeout + while (ReadTxdctl_enable(i) == 0) + ; + + // TODO: set up dca txctrl FreeBSD? 
+ // clear TXdescWBROen + WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); +} + +// after packet received, need to make sure device can reuse +// For every descriptor recorded in rsc_chain_: advance rx_tail_ by one, zero +// the descriptor, allocate a fresh RXBUFSZ buffer into circ_buffer_ and store +// its address in the descriptor. +void ebbrt::IxgbeDriverRep::ReclaimRx() { + for (size_t i = 0; i < ixgmq_.rsc_chain_.size(); i++) { + // bump tail ptr + ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + auto n = ixgmq_.rsc_chain_[i].first; + + // reset buffer + ixgmq_.rx_ring_[n].raw[0] = 0; + ixgmq_.rx_ring_[n].raw[1] = 0; + // allocate new rx buffer + // NOTE(review): std::move on the returned temporary is redundant. + ixgmq_.circ_buffer_[n] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[n])->MutData()); + // update buffer with new address + ixgmq_.rx_ring_[n].buffer_address = rxphys; + } +} + +// keep check for new packets to receive +// may wait for RSC to be done +// Returns 0 when a packet (or a completed RSC chain, flagged through +// *process_rsc) is ready and 1 when there is nothing to hand up yet. +// *rxflag receives the RXFLAG_* checksum bits; *len is set only on the +// single-descriptor paths. bAddr and rnt are not written by this function. +uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, + uint64_t* rxflag, bool* process_rsc, + uint32_t* rnt) { +#ifdef RSC_EN + rdesc_adv_wb_t* tmp; + tmp = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); + + // if rx packet not ready + if (!(tmp->dd)) { + return 1; + } + + auto rsccnt = tmp->rsccnt; + + // not RSC, handled normally + if (rsccnt == 0 && tmp->eop) { + *len = tmp->pkt_len; + + /* set rx flags */ + // TCP/UDP checksum + if (tmp->l4i) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp->l4e)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp->ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp->ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + // reset descriptor + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 0; + } + // not sure what case this is, no context started, eop is set but rsccnt > 0 + // NOTE(review): ring indices wrap modulo rx_size_, so a next pointer equal to + // rx_size_ also passes this check — confirm whether >= was intended. + else if (rsccnt > 0 && tmp->eop && !(ixgmq_.rsc_used)) { + kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, + "RSC: NEXTP > RX_SIZE\n"); + + *len = tmp->pkt_len; + + /* set rx flags */ + // TCP/UDP checksum + if 
(tmp->l4i) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp->l4e)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp->ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp->ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + // reset descriptor + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 0; + } + // START NEW RSC CONTEXT + // (records the head descriptor and its length as the first chain entry and + // returns 1 so the caller keeps waiting for the rest of the chain) + // NOTE(review): a next pointer equal to rx_size_ passes the checks below — + // confirm whether >= was intended. + else if (rsccnt > 0 && !(tmp->eop) && !(ixgmq_.rsc_used)) { + kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, + "RSC: NEXTP > RX_SIZE\n"); + + ixgmq_.rsc_used = true; + ixgmq_.rsc_chain_.clear(); + ixgmq_.rsc_chain_.emplace_back( + std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 1; + } + // APPEND TO EXISTING RSC CONTEXT + else if (rsccnt > 0 && !(tmp->eop) && ixgmq_.rsc_used) { + kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, + "RSC: NEXTP > RX_SIZE\n"); + + ixgmq_.rsc_chain_.emplace_back( + std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 1; + } + // LAST RSC CONTEXT + // (closes the chain, sets *process_rsc and returns 0; *len is not written on + // this path — the per-descriptor lengths are kept in rsc_chain_) + else if (rsccnt > 0 && tmp->eop && ixgmq_.rsc_used) { + ixgmq_.rsc_used = false; + + /* set rx flags */ + // TCP/UDP checksum + if (tmp->l4i) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp->l4e)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp->ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp->ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + ixgmq_.rsc_chain_.emplace_back( + std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + *process_rsc = true; + + return 0; + } else { + // shouldn't hit here + ebbrt::kabort("%s Not sure what state\n", __FUNCTION__); + } + +#else + // no RSC so just get one packet at a time + 
int c = static_cast(Cpu::GetMine()); + rdesc_legacy_t tmp; + tmp = ixgmq_.rx_ring_[ixgmq_.rx_head_]; + + if (tmp.dd && tmp.eop) { + *len = tmp.length; + + /* set rx flags */ + // TCP/UDP checksum + if (tmp.l4cs) { + *rxflag |= RXFLAG_L4CS; + if (!(tmp.tcpe)) { + *rxflag |= RXFLAG_L4CS_VALID; + } + } + + // Ipv4 checksum + if (tmp.ipcs) { + *rxflag |= RXFLAG_IPCS; + if (!(tmp.ipe)) { + *rxflag |= RXFLAG_IPCS_VALID; + } + } + + // reset descriptor + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; + ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + + // bump head ptr + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + return 0; + } +#endif + + return 1; +} + +void ebbrt::IxgbeDriverRep::ReceivePoll() { + uint32_t len; + uint64_t bAddr; + uint64_t rxflag; + bool process_rsc; + uint32_t count; + uint32_t rnt; + static bool ret = false; + process_rsc = false; + +retry: + rxflag = 0; + count = 0; + rnt = 0; + + // get address of buffer with data + while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt) == 0) { + // hit last rsc context, start to process all buffers + if (process_rsc) { + ret = true; + process_rsc = false; + count++; + + auto n = ixgmq_.rsc_chain_[0].first; + auto rsclen = 0; + + // TODO hack - need to set actual length of data else there'll be 0's + // attached + ixgmq_.circ_buffer_[n]->SetLength(ixgmq_.rsc_chain_[0].second); + + rsclen += ixgmq_.rsc_chain_[0].second; + + // TODO - maybe find better way to rewrite this + auto b = std::move(ixgmq_.circ_buffer_[n]); + + for (size_t x = 1; x < ixgmq_.rsc_chain_.size(); x++) { + count++; + + auto n = ixgmq_.rsc_chain_[x].first; + // TODO hack - need to set actual length of data + ixgmq_.circ_buffer_[n]->SetLength(ixgmq_.rsc_chain_[x].second); + rsclen += ixgmq_.rsc_chain_[x].second; + b->PrependChain(std::move(ixgmq_.circ_buffer_[n])); + } + + ReclaimRx(); + + root_.itf_.Receive(std::move(b), rxflag); + } else { + // done with buffer addr above, now to reuse it + auto tail = ixgmq_.rx_tail_; + + // 
bump tail ptr + ixgmq_.rx_tail_ = (tail + 1) % ixgmq_.rx_size_; + + count++; + + if (count > 0) { + auto tail = ixgmq_.rx_tail_; + + // TODO hack - need to set actual length of data otherwise it'll send + // leftover 0's + ixgmq_.circ_buffer_[tail]->SetLength(len); + + // TODO hack - need to reallocate IOBuf after its been moved to Receive + auto b = std::move(ixgmq_.circ_buffer_[tail]); + + ixgmq_.circ_buffer_[tail] = + std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[tail])->MutData()); + + ixgmq_.rx_ring_[tail].buffer_address = rxphys; + + root_.itf_.Receive(std::move(b), rxflag); + } + } + } + + // TODO: Update tail register here or above? + if (count > 0) { + // update reg + WriteRdt_1(Cpu::GetMine(), ixgmq_.rx_tail_); + } + + // keep looping back once we see start of rsc context + if (likely(ret)) { + goto retry; + } +} + +ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) + : root_(root), ixgq_(root_.GetQueue()), + ixgmq_(root.GetMultiQueue(Cpu::GetMine())), + receive_callback_([this]() { ReceivePoll(); }) { + this->ReceivePoll(); +} + +uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01010 + 0x40 * n); + return reg & 0xFFFF; +} +uint16_t ebbrt::IxgbeDriverRep::ReadRdt_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01018 + 0x40 * n); + return reg & 0xFFFF; +} + +void ebbrt::IxgbeDriverRep::WriteRdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x01018 + 0x40 * n, m); +} + +void ebbrt::IxgbeDriverRep::Run() { + while (1) { + ReceivePoll(); + } +} +void ebbrt::IxgbeDriverRep::WriteTdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x06018 + 0x40 * n, m); +} + +// 8.2.3.5.9 Extended Interrupt Mask Clear Registers — EIMC[n] +// (0x00AB0 + 4*(n-1), n=1...2; WO) - write-only, so mask m is written as-is with no read-back +void ebbrt::IxgbeDriverRep::WriteEimcn(uint32_t n, uint32_t m) { + const uint32_t offset = 0x00AB0 + 4 * n; + root_.bar0_.Write32(offset, m); +} diff --git 
a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h new file mode 100644 index 00000000..46670a2d --- /dev/null +++ b/src/native/IxgbeDriver.h @@ -0,0 +1,473 @@ +// Copyright Boston University SESA Group 2013 - 2017. +// Distributed under the Boost Software License, Version 1.0. +// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ + +#include "../Align.h" +#include "../MulticoreEbb.h" +#include "../SpinLock.h" +#include "../StaticIOBuf.h" +#include "../UniqueIOBuf.h" +#include "Debug.h" +#include "Fls.h" +#include "Ixgbe.h" +#include "Net.h" +#include "PageAllocator.h" +#include "Pci.h" +#include "Pfn.h" +#include "SlabAllocator.h" + +// Receive Side Coalescing (RSC) enabled +#define RSC_EN +// Direct Cache Access (DCA) enabled +#define DCA_ENABLE +// Transmit head pointer write-back enabled +#define TX_HEAD_WB + +namespace ebbrt { + +// Per-core receive and transmit queue +typedef struct { + rdesc_legacy_t* rx_ring; + size_t rx_head; + size_t rx_tail; + size_t rx_size; + + tdesc_legacy_t* tx_ring; + uint32_t* tx_head; + size_t tx_tail; + size_t tx_last_tail; + size_t tx_size; + bool* tx_isctx; + + // buffers holding packet data + std::vector> circ_buffer; +} e10k_queue_t; + +class IxgbeDriverRep; + +class IxgbeDriver : public EthernetDevice { + public: + explicit IxgbeDriver(pci::Device& dev) + : itf_(network_manager->NewInterface(*this)), dev_(dev), + bar0_(dev.GetBar(0)) { + dev_.SetBusMaster(true); + + // set up interrupts, polling won't work after this + auto msix = dev_.MsixEnable(); + kbugon(!msix, "Ixgbe without msix is unsupported\n"); + + // each core gets a queue struct + ixgmq.resize(Cpu::Count()); + } + + static void Create(pci::Device& dev); + static bool Probe(pci::Device& dev) { + if (dev.GetVendorId() == kIxgbeVendorId && + dev.GetDeviceId() == kIxgbeDeviceId && dev.GetFunc() == 0) { + 
IxgbeDriver::Create(dev); + return true; + } + return false; + } + + void Run(); + void Send(std::unique_ptr buf, PacketInfo pinfo) override; + const EthernetAddress& GetMacAddress() override; + + protected: + static const constexpr uint16_t kIxgbeVendorId = 0x8086; + static const constexpr uint16_t kIxgbeDeviceId = 0x10F8; // 0x10FB; + + /* FreeBSD: + * RxDescriptors Valid Range: 64-4096 Default Value: 256 This value is the + * number of receive descriptors allocated for each RX queue. Increasing this + * value allows the driver to buffer more incoming packets. Each descriptor + * is 16 bytes. A receive buffer is also allocated for each descriptor. + * + * Note: with 8 rings and a dual port card, it is possible to bump up + * against the system mbuf pool limit, you can tune nmbclusters + * to adjust for this. + */ + static const constexpr uint32_t NTXDESCS = 256; + static const constexpr uint32_t NRXDESCS = 256; + // static const constexpr uint32_t NTXDESCS = 4096; + // static const constexpr uint32_t NRXDESCS = 4096; + static const constexpr uint32_t RXBUFSZ = 4096; + // static const constexpr uint32_t RXBUFSZ = 16384; + + // Class with per core queue data structures + class e10Kq { + public: + e10Kq(size_t idx, Nid nid) + : rx_head_(0), rx_tail_(0), rx_size_(NRXDESCS), tx_tail_(0), + tx_last_tail_(0), tx_size_(NTXDESCS), idx_(idx), rxflag_(0), + rsc_used(false), hanc{0} { + + circ_buffer_.reserve(NRXDESCS); + for (uint32_t k = 0; k < NRXDESCS; k++) { + circ_buffer_.emplace_back(MakeUniqueIOBuf(RXBUFSZ, true)); + } + + // rsc_chain_ is a map between receive descriptor number and + // packet len, need packet len to extract out + // packet data else code will read redundant + // zeros if packet len does not use full buffer + // TODO: should be optimized + rsc_chain_.reserve(NRXDESCS); + + // RX ring buffer allocation + auto sz = align::Up(sizeof(rdesc_legacy_t) * NRXDESCS, 4096); + auto order = Fls(sz - 1) - pmem::kPageShift + 1; + auto page = 
page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", + __FUNCTION__); + auto addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + rx_ring_ = static_cast(addr); + + // TX ring buffer allocation + sz = align::Up(sizeof(tdesc_legacy_t) * NTXDESCS, 4096); + order = Fls(sz - 1) - pmem::kPageShift + 1; + page = page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", + __FUNCTION__); + addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + tx_ring_ = static_cast(addr); + + // TX adv context buffer allocation + sz = align::Up(sizeof(bool) * NTXDESCS, 4096); + order = Fls(sz - 1) - pmem::kPageShift + 1; + page = page_allocator->Alloc(order, nid); + kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", + __FUNCTION__); + addr = reinterpret_cast(page.ToAddr()); + memset(addr, 0, sz); + tx_isctx_ = static_cast(addr); + +#ifdef TX_HEAD_WB + // TODO: not sure how much exactly to allocate for head wb addr + tx_head_ = (uint32_t*)malloc(4 * sizeof(uint32_t)); + memset(tx_head_, 0, 4 * sizeof(uint32_t)); + txhwbaddr_ = reinterpret_cast(tx_head_); + // txhwbaddr must be 4-byte aligned (low two address bits clear) + ebbrt::kbugon((txhwbaddr_ & 0x3) != 0, "txhwbaddr not byte aligned\n"); + kassert((txhwbaddr_ & 0x3) == 0); +#else + tx_head_ = 0; +#endif + + // get starting address, need to write to device registers + rxaddr_ = reinterpret_cast(rx_ring_); + txaddr_ = reinterpret_cast(tx_ring_); + rx_size_bytes_ = sizeof(rdesc_legacy_t) * NRXDESCS; + tx_size_bytes_ = sizeof(tdesc_legacy_t) * NTXDESCS; + + // ring base addresses and ring sizes in bytes must be 128 byte aligned + ebbrt::kbugon((rxaddr_ & 0x7F) != 0, "rx_addr_ not 128 byte aligned\n"); + ebbrt::kbugon((txaddr_ & 0x7F) != 0, "tx_addr_ not 128 byte aligned\n"); + ebbrt::kbugon((rx_size_bytes_ & 0x7F) != 0, + "rx_size_bytes_ not 128 byte aligned\n"); + ebbrt::kbugon((tx_size_bytes_ & 0x7F) != 0, + "tx_size_bytes_ not 128 byte aligned\n"); + } + + size_t rx_head_; + 
size_t rx_tail_; + size_t rx_size_; + size_t tx_tail_; + size_t tx_last_tail_; + size_t tx_size_; + size_t idx_; + size_t rx_size_bytes_; + size_t tx_size_bytes_; + uint64_t rxaddr_; + uint64_t txaddr_; + uint64_t txhwbaddr_; + uint64_t rxflag_; + + std::vector> circ_buffer_; + std::vector> rsc_chain_; + + rdesc_legacy_t* rx_ring_; + tdesc_legacy_t* tx_ring_; + bool* tx_isctx_; + bool rsc_used; + int hanc; +#ifdef TX_HEAD_WB + uint32_t* tx_head_; +#else + size_t tx_head_; +#endif + }; + + private: + EbbRef ebb_; + NetworkManager::Interface& itf_; + EthernetAddress mac_addr_; + + void Init(); + void PhyInit(); + void StopDevice(); + void GlobalReset(); + void SetupMultiQueue(uint32_t i); + void FinishSetup(); + + // device register writing code below + bool SwsmSmbiRead(); + void SwsmSmbiClear(); + + void SwsmSwesmbiSet(); + bool SwsmSwesmbiRead(); + void SwsmSwesmbiClear(); + + uint32_t ReadSwfwSyncSmBits(uint32_t m); + void WriteSwfwSyncSmBits(uint32_t m); + void WriteSwfwSyncSmBits2(uint32_t m); + + bool SwfwLockPhy(); + void SwfwUnlockPhy(); + bool SwfwSemAcquire(); + void SwfwSemRelease(); + + void WriteRxctrl(uint32_t m); + void WriteDmatxctl(uint32_t m); + void WriteDmatxctl_te(uint32_t m); + + void WriteEimc(uint32_t m); + void WriteEitr(uint32_t n, uint32_t m); + + void WriteTxdctl(uint32_t n, uint32_t m); + + void WriteRxdctl_1(uint32_t n, uint32_t m); + void WriteRxdctl_1_enable(uint32_t n, uint32_t m); + + void WriteRxdctl_2(uint32_t n, uint32_t m); + void WriteCtrl(uint32_t m); + void WriteCtrlExt(uint32_t m); + void WriteFcttv(uint32_t n, uint32_t m); + void WriteFcrtl(uint32_t n, uint32_t m); + void WriteFcrth(uint32_t n, uint32_t m); + void WriteFcrtv(uint32_t m); + void WriteFccfg(uint32_t m); + void WriteEerd(uint32_t m); + + void WriteCorectl(uint16_t m); + + void WriteAutoc(uint32_t m); + + void WriteEicr(uint32_t m); + void WriteGpie(uint32_t m); + + void WriteEims(uint32_t m); + + void WriteRal(uint32_t n, uint32_t m); + void WriteRah(uint32_t 
n, uint32_t m); + + void WriteMta(uint32_t n, uint32_t m); + void WriteVfta(uint32_t n, uint32_t m); + void WritePfvlvf(uint32_t n, uint32_t m); + void WritePfvlvfb(uint32_t n, uint32_t m); + void WriteMpsar(uint32_t n, uint32_t m); + void WriteFtqf(uint32_t n, uint32_t m); + void WriteSaqf(uint32_t n, uint32_t m); + void WriteDaqf(uint32_t n, uint32_t m); + void WriteSdpqf(uint32_t n, uint32_t m); + + void WriteFctrl(uint32_t m); + void WriteFhft_1(uint32_t n, uint32_t m); + void WriteFhft_2(uint32_t n, uint32_t m); + + void WritePfuta(uint32_t n, uint32_t m); + void WriteMcstctrl(uint32_t m); + + void WriteRttdqsel(uint32_t m); + void WriteRttbcnrc(uint32_t m); + + void WriteDcaTxctrlTxdescWbro(uint32_t n, uint32_t m); + void WriteDcaTxctrl(uint32_t n, uint32_t m); + void WriteDcaRxctrl(uint32_t n, uint32_t m); + void WriteDcaRxctrlClear(uint32_t n, uint32_t m); + void WriteDcaRxctrl_1(uint32_t n, uint32_t m); + void WriteDcaRxctrl_2(uint32_t n, uint32_t m); + void WriteDcaCtrl(uint32_t m); + + void WriteRdbal_1(uint32_t n, uint32_t m); + void WriteRdbal_2(uint32_t n, uint32_t m); + + void WriteRdbah_1(uint32_t n, uint32_t m); + void WriteRdbah_2(uint32_t n, uint32_t m); + + void WriteRdlen_1(uint32_t n, uint32_t m); + void WriteRdlen_2(uint32_t n, uint32_t m); + + void WriteSrrctl_1(uint32_t n, uint32_t m); + void WriteSrrctlZero(uint32_t n); + void WriteSrrctl_1_desctype(uint32_t n, uint32_t m); + void WriteRscdbu(uint32_t m); + + void WriteRdt_1(uint32_t n, uint32_t m); + void WriteRdh_1(uint32_t n, uint32_t m); + void WriteRdt_2(uint32_t n, uint32_t m); + + void WriteIvarAlloc0(uint32_t n, uint32_t m); + void WriteIvarAllocval0(uint32_t n, uint32_t m); + void WriteIvarAlloc1(uint32_t n, uint32_t m); + void WriteIvarAllocval1(uint32_t n, uint32_t m); + void WriteIvarAlloc2(uint32_t n, uint32_t m); + void WriteIvarAllocval2(uint32_t n, uint32_t m); + void WriteIvarAlloc3(uint32_t n, uint32_t m); + void WriteIvarAllocval3(uint32_t n, uint32_t m); + + void 
WriteSecrxctrl_Rx_Dis(uint32_t m); + + void WriteTdbal(uint32_t n, uint32_t m); + void WriteTdbah(uint32_t n, uint32_t m); + void WriteTdlen(uint32_t n, uint32_t m); + + void WriteTdh(uint32_t n, uint32_t m); + void WriteTdt(uint32_t n, uint32_t m); + + void WriteTdwbal(uint32_t n, uint32_t m); + void WriteTdwbah(uint32_t n, uint32_t m); + + void WriteHlreg0(uint32_t m); + void WriteRdrxctl(uint32_t m); + void WriteRdrxctlRSCFRSTSIZE(uint32_t m); + + void WriteEiac(uint32_t m); + void WriteEimsn(uint32_t n, uint32_t m); + + void WriteRfctl(uint32_t m); + + void WriteRscctl(uint32_t n, uint32_t m); + void WritePsrtype(uint32_t n, uint32_t m); + + void WriteRxcsum(uint32_t m); + void WriteTxpbthresh(uint32_t n, uint32_t m); + void WriteMrqc(uint32_t m); + void WriteDtxmxszrq(uint32_t m); + void WriteMflcn(uint32_t m); + void WriteReta(uint32_t n, uint32_t m); + + void WritePsrtypeZero(uint32_t n); + + void WriteRttdcs(uint32_t m); + void WriteRttdcsArbdisEn(uint32_t m); + void WriteRxpbsize(uint32_t n, uint32_t m); + void WriteTxpbsize(uint32_t n, uint32_t m); + void WriteTxpbThresh(uint32_t n, uint32_t m); + void WriteMtqc(uint32_t m); + void WritePfvtctl(uint32_t m); + void WriteRtrup2tc(uint32_t m); + void WriteRttup2tc(uint32_t m); + void WritePfqde(uint32_t m); + void WriteRttdt1c(uint32_t m); + void WriteRttdt2c(uint32_t n, uint32_t m); + void WriteRttpt2c(uint32_t n, uint32_t m); + void WriteRtrpt4c(uint32_t n, uint32_t m); + void WriteRttpcs(uint32_t m); + void WriteRtrpcs(uint32_t m); + void WritePfvml2flt(uint32_t n, uint32_t m); + + void WriteMngtxmap(uint32_t m); + + void WriteRxfeccerr0(uint32_t m); + + uint8_t ReadRdrxctlDmaidone(); + + void ReadEicr(); + bool ReadStatusPcieMes(); + uint8_t ReadStatusLanId(); + void ReadCtrl(); + bool ReadEerdDone(); + uint16_t ReadEerdData(); + uint16_t ReadEeprom(uint16_t offset); + uint8_t ReadAnlp1(); + uint8_t ReadAutocRestartAn(); + uint8_t ReadEecAutoRd(); + uint32_t ReadEims(); + + uint32_t ReadRal(uint32_t n); 
+ uint16_t ReadRah(uint32_t n); + uint8_t ReadRahAv(uint32_t n); + + uint8_t ReadRxdctl_1_enable(uint32_t n); + uint8_t ReadSecrxstat_Sr_Rdy(); + + uint8_t ReadTxdctl_enable(uint32_t n); + + uint16_t ReadRdh_1(uint32_t n); + uint16_t ReadTdh(uint32_t n); + uint16_t ReadRdt_1(uint32_t n); + + // some statistics + uint32_t ReadTpr(); + uint32_t ReadGprc(); + bool ReadLinksLinkUp(); + + // Process packet functions + void ProcessPacket(uint32_t n); + uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr); + void SendPacket(uint32_t n); + + e10k_queue_t& GetQueue() const { return *ixgq; } + + e10Kq& GetMultiQueue(size_t index) const { return *ixgmq[index]; } + + pci::Device& dev_; + pci::Bar& bar0_; + + struct IxgbeRegs { + volatile uint32_t kIxgbeCtrl; + volatile uint32_t kIxgbeCtrlBak; + volatile uint32_t kIxgbeStatus; + }; + + e10k_queue_t* ixgq; + uint8_t rcv_vector{0}; + + std::vector> ixgmq; + + friend class IxgbeDriverRep; +}; // class IxgbeDriver + +class IxgbeDriverRep : public MulticoreEbb { + public: + explicit IxgbeDriverRep(const IxgbeDriver& root); + void Run(); + void ReceivePoll(); + void ReclaimTx(); + void ReclaimRx(); + void Send(std::unique_ptr buf, PacketInfo pinfo); + void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, + enum l4_type l4type); + void AddTx(const uint8_t* pa, uint64_t len, uint64_t totallen, bool first, + bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum); + + private: + uint16_t ReadRdh_1(uint32_t n); + uint16_t ReadRdt_1(uint32_t n); + void WriteRdt_1(uint32_t n, uint32_t m); + void WriteRdh_1(uint32_t n, uint32_t m); + // uint16_t ReadRdt_1(uint32_t n); + // uint16_t ReadRdh_1(uint32_t n); + void WriteTdt_1(uint32_t n, uint32_t m); + void WriteEimcn(uint32_t n, uint32_t m); + uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, + bool* process_rsc, uint32_t* rnt); + + const IxgbeDriver& root_; + e10k_queue_t& ixgq_; + IxgbeDriver::e10Kq& ixgmq_; + + EventManager::IdleCallback 
receive_callback_; + +}; // class IxgbeDriverRep + +} // namespace ebbrt + +#endif // BAREMETAL_SRC_INCLUDE_EBBRT_IXGBE_DRIVER_H_ diff --git a/src/native/Main.cc b/src/native/Main.cc index 4e2f0145..382eaad6 100644 --- a/src/native/Main.cc +++ b/src/native/Main.cc @@ -47,7 +47,11 @@ #include "Trans.h" #include "VMem.h" #include "VMemAllocator.h" +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ +#include "IxgbeDriver.h" +#else #include "VirtioNet.h" +#endif namespace { bool started_once = false; @@ -146,18 +150,29 @@ ebbrt::Main(multiboot::Information* mbi) { Timer::Init(); smp::Init(); event_manager->ReceiveToken(); + #ifdef __EBBRT_ENABLE_NETWORKING__ NetworkManager::Init(); pci::Init(); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + pci::RegisterProbe(IxgbeDriver::Probe); +#else pci::RegisterProbe(VirtioNetDriver::Probe); +#endif + pci::LoadDrivers(); network_manager->StartDhcp().Then([](Future fut) { fut.Get(); // Dhcp completed #ifdef __EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ +// Currently not supported in BMNIC since we don't pass arguments +// via grub +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ Messenger::Init(); runtime::Init(); #endif +#endif #endif // run global ctors for (unsigned i = 0; i < (end_ctors - start_ctors); ++i) { diff --git a/src/native/Msr.h b/src/native/Msr.h index 4e3b7ba6..9adc0699 100644 --- a/src/native/Msr.h +++ b/src/native/Msr.h @@ -30,6 +30,17 @@ inline uint64_t Read(uint32_t index) { inline void Write(uint32_t index, uint64_t data) { uint32_t low = data; uint32_t high = data >> 32; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // TODO - correct fix is here? 
+ // GP fault happens when writing a 1 to bit #3 for kX2apicDcr, + // which is a reserved bit + // only happens in baremetal, VM prob virtualized this issue + if ((((data >> 2) & 0x1) == 1) && index == kX2apicDcr) { + low = (data & 0x3) | ((data & 0x4) << 1); + high = 0x0; + } +#endif asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); } } // namespace msr diff --git a/src/native/Net.cc b/src/native/Net.cc index 581cec54..d884b7e0 100644 --- a/src/native/Net.cc +++ b/src/native/Net.cc @@ -12,7 +12,8 @@ ebbrt::NetworkManager::NewInterface(EthernetDevice& ether_dev) { return *interface_; } -void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Drop packets that are too small @@ -26,7 +27,7 @@ void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf) { switch (ntohs(eth_header.type)) { case kEthTypeIp: { - ReceiveIp(eth_header, std::move(buf)); + ReceiveIp(eth_header, std::move(buf), rxflag); break; } case kEthTypeArp: { diff --git a/src/native/Net.h b/src/native/Net.h index 389bd1fb..3568f0cd 100644 --- a/src/native/Net.h +++ b/src/native/Net.h @@ -25,9 +25,16 @@ #include "RcuTable.h" #include "SharedPoolAllocator.h" +// IP and L4 checksum offload bits +#define RXFLAG_IPCS (1 << 0) +#define RXFLAG_IPCS_VALID (1 << 1) +#define RXFLAG_L4CS (1 << 2) +#define RXFLAG_L4CS_VALID (1 << 3) + namespace ebbrt { struct PacketInfo { static const constexpr uint8_t kNeedsCsum = 1; + static const constexpr uint8_t kNeedsIpCsum = 2; static const constexpr uint8_t kGsoNone = 0; static const constexpr uint8_t kGsoTcpv4 = 1; static const constexpr uint8_t kGsoUdp = 3; @@ -230,7 +237,7 @@ class NetworkManager : public StaticSharedEbb { explicit Interface(EthernetDevice& ether_dev) : address_(nullptr), ether_dev_(ether_dev) {} - void Receive(std::unique_ptr buf); + void Receive(std::unique_ptr buf, uint64_t rxflag = 
0); void Send(std::unique_ptr buf, PacketInfo pinfo = PacketInfo()); void SendUdp(UdpPcb& pcb, Ipv4Address addr, uint16_t port, std::unique_ptr buf); @@ -260,11 +267,14 @@ class NetworkManager : public StaticSharedEbb { }; void ReceiveArp(EthernetHeader& eh, std::unique_ptr buf); - void ReceiveIp(EthernetHeader& eh, std::unique_ptr buf); + void ReceiveIp(EthernetHeader& eh, std::unique_ptr buf, + uint64_t rxflag = 0); void ReceiveIcmp(EthernetHeader& eh, Ipv4Header& ih, std::unique_ptr buf); - void ReceiveUdp(Ipv4Header& ih, std::unique_ptr buf); - void ReceiveTcp(const Ipv4Header& ih, std::unique_ptr buf); + void ReceiveUdp(Ipv4Header& ih, std::unique_ptr buf, + uint64_t rxflag = 0); + void ReceiveTcp(const Ipv4Header& ih, std::unique_ptr buf, + uint64_t rxflag = 0); void ReceiveDhcp(Ipv4Address from_addr, uint16_t from_port, std::unique_ptr buf); void EthArpSend(uint16_t proto, const Ipv4Header& ih, diff --git a/src/native/NetIcmp.cc b/src/native/NetIcmp.cc index e5c06153..6ecfde0d 100644 --- a/src/native/NetIcmp.cc +++ b/src/native/NetIcmp.cc @@ -19,9 +19,11 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( auto dp = buf->GetMutDataPointer(); auto& icmp_header = dp.Get(); - // checksum +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ + // software checksum if (IpCsum(*buf)) return; +#endif // if echo_request, send reply if (icmp_header.type == kIcmpEchoRequest) { @@ -43,9 +45,19 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( ip_header.ttl = kIpDefaultTtl; ip_header.chksum = 0; + + PacketInfo pinfo; + pinfo.flags = 0; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // hardware ip checksum offload + pinfo.flags |= PacketInfo::kNeedsIpCsum; +#else ip_header.chksum = ip_header.ComputeChecksum(); +#endif buf->Retreat(ip_header.HeaderLength()); - EthArpSend(kEthTypeIp, ip_header, std::move(buf)); + + EthArpSend(kEthTypeIp, ip_header, std::move(buf), pinfo); } } diff --git a/src/native/NetIp.cc b/src/native/NetIp.cc index d76ddce7..d320195b 100644 --- 
a/src/native/NetIp.cc +++ b/src/native/NetIp.cc @@ -36,8 +36,9 @@ bool ebbrt::NetworkManager::Interface::ItfAddress::isLocalNetwork( } // Receive an Ipv4 packet -void ebbrt::NetworkManager::Interface::ReceiveIp( - EthernetHeader& eth_header, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveIp(EthernetHeader& eth_header, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); if (unlikely(packet_len < sizeof(Ipv4Header))) @@ -59,8 +60,21 @@ void ebbrt::NetworkManager::Interface::ReceiveIp( buf->TrimEnd(packet_len - tot_len); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // baremetal checksum offload + if (unlikely((rxflag & RXFLAG_IPCS) == 0)) { + ebbrt::kprintf("%s RXFLAG_IPCS failed\n", __FUNCTION__); + return; + } + + if (unlikely((rxflag & RXFLAG_IPCS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_IPCS_VALID failed\n", __FUNCTION__); + return; + } +#else if (unlikely(ip_header.ComputeChecksum() != 0)) return; +#endif auto addr = Address(); // Unless the protocol is UDP or we have an address on this interface and the @@ -87,11 +101,11 @@ void ebbrt::NetworkManager::Interface::ReceiveIp( break; } case kIpProtoUDP: { - ReceiveUdp(ip_header, std::move(buf)); + ReceiveUdp(ip_header, std::move(buf), rxflag); break; } case kIpProtoTCP: { - ReceiveTcp(ip_header, std::move(buf)); + ReceiveTcp(ip_header, std::move(buf), rxflag); break; } } @@ -123,9 +137,14 @@ void ebbrt::NetworkManager::Interface::SendIp(std::unique_ptr buf, ih.chksum = 0; ih.src = src; ih.dst = dst; - ih.chksum = ih.ComputeChecksum(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // baremetal ip checksum offload + pinfo.flags |= PacketInfo::kNeedsIpCsum; +#else + ih.chksum = ih.ComputeChecksum(); kassert(ih.ComputeChecksum() == 0); +#endif pinfo.csum_start += sizeof(Ipv4Header); pinfo.hdr_len += sizeof(Ipv4Header); diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index e792f34a..c0f833fb 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ 
-194,8 +194,9 @@ ebbrt::Ipv4Address ebbrt::NetworkManager::TcpPcb::GetRemoteAddress() { } // Receive a TCP packet on an interface -void ebbrt::NetworkManager::Interface::ReceiveTcp( - const Ipv4Header& ih, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveTcp(const Ipv4Header& ih, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Ensure we have a header @@ -210,10 +211,21 @@ void ebbrt::NetworkManager::Interface::ReceiveTcp( if (unlikely(addr->isBroadcast(ih.dst) || ih.dst.isMulticast())) return; - // XXX: Check if rxcsum is supported - // if (unlikely(IpPseudoCsum(*buf, ih.proto, ih.src, ih.dst))) - // return; +// XXX: Check if rxcsum is supported +// if (unlikely(IpPseudoCsum(*buf, ih.proto, ih.src, ih.dst))) +// return; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (unlikely((rxflag & RXFLAG_L4CS) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS failed\n", __FUNCTION__); + return; + } + + if (unlikely((rxflag & RXFLAG_L4CS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS_VALID failed\n", __FUNCTION__); + return; + } +#endif auto hdr_len = tcp_header.HdrLen(); if (unlikely(hdr_len < sizeof(TcpHeader) || hdr_len > packet_len)) return; diff --git a/src/native/NetUdp.cc b/src/native/NetUdp.cc index 7da5fdc4..992ee21f 100644 --- a/src/native/NetUdp.cc +++ b/src/native/NetUdp.cc @@ -57,8 +57,9 @@ void ebbrt::NetworkManager::UdpPcb::Receive( } // Receive UDP packet on an interface -void ebbrt::NetworkManager::Interface::ReceiveUdp( - Ipv4Header& ip_header, std::unique_ptr buf) { +void ebbrt::NetworkManager::Interface::ReceiveUdp(Ipv4Header& ip_header, + std::unique_ptr buf, + uint64_t rxflag) { auto packet_len = buf->ComputeChainDataLength(); // Ensure we have a header @@ -75,10 +76,20 @@ void ebbrt::NetworkManager::Interface::ReceiveUdp( // trim any excess off the packet buf->TrimEnd(packet_len - ntohs(udp_header.length)); - // XXX: Check if rxcsum supported - // if (udp_header.checksum && - // IpPseudoCsum(*buf, ip_header.proto, 
ip_header.src, ip_header.dst)) - // return; +// XXX: Check if rxcsum supported +// if (udp_header.checksum && +// IpPseudoCsum(*buf, ip_header.proto, ip_header.src, ip_header.dst)) +// return; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (unlikely((rxflag & RXFLAG_L4CS) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS failed\n", __FUNCTION__); + return; + } + if (unlikely((rxflag & RXFLAG_L4CS_VALID) == 0)) { + ebbrt::kprintf("%s RXFLAG_L4CS_VALID failed\n", __FUNCTION__); + return; + } +#endif auto entry = network_manager->udp_pcbs_.find(ntohs(udp_header.dst_port)); diff --git a/src/native/Pci.cc b/src/native/Pci.cc index cdd53dae..2a740f7e 100644 --- a/src/native/Pci.cc +++ b/src/native/Pci.cc @@ -9,6 +9,7 @@ #include "../Align.h" #include "../ExplicitlyConstructed.h" #include "Debug.h" +#include "GeneralPurposeAllocator.h" #include "Io.h" #include "VMem.h" #include "VMemAllocator.h" @@ -34,7 +35,11 @@ uint8_t PciRead8(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { } uint16_t PciRead16(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { PciSetAddr(bus, device, func, offset); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + return ebbrt::io::In16(kPciDataPort + (offset & 2)); +#else return ebbrt::io::In16(kPciDataPort); +#endif } uint32_t PciRead32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { @@ -45,7 +50,12 @@ uint32_t PciRead32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset) { void PciWrite16(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset, uint16_t val) { PciSetAddr(bus, device, func, offset); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + ebbrt::io::Out16(kPciDataPort + (offset & 2), val); +#else ebbrt::io::Out16(kPciDataPort, val); +#endif } void PciWrite32(uint8_t bus, uint8_t device, uint8_t func, uint8_t offset, @@ -71,8 +81,12 @@ void EnumerateBus(uint8_t bus) { if (dev) continue; + dev.DumpAddress(); + dev.DumpInfo(); + if (dev.IsBridge()) { - ebbrt::kabort("Secondary bus unsupported!\n"); + // ebbrt::kabort("Secondary bus unsupported!\n"); + 
continue; } else { devices->emplace_back(bus, device, func); } @@ -101,6 +115,11 @@ void ebbrt::pci::Init() { devices.construct(); driver_probes.construct(); EnumerateAllBuses(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // TODO - Kludge to identify where NIC sits in device tree, should incorporate + // Dan's pull request for enumerating bridges + EnumerateBus(0x1); +#endif } void ebbrt::pci::RegisterProbe(std::function probe) { @@ -149,6 +168,18 @@ uint16_t ebbrt::pci::Function::GetCommand() const { return Read16(kCommandAddr); } +uint8_t ebbrt::pci::Function::GetClassCode() const { + return Read8(kClassCodeAddr); +} + +uint8_t ebbrt::pci::Function::GetFunc() const { return func_; } + +uint8_t ebbrt::pci::Function::GetSubclass() const { + return Read8(kSubclassAddr); +} + +uint8_t ebbrt::pci::Function::GetProgIf() const { return Read8(kProgIfAddr); } + uint8_t ebbrt::pci::Function::GetHeaderType() const { return Read8(kHeaderTypeAddr) & ~kHeaderMultifuncMask; } @@ -187,6 +218,11 @@ void ebbrt::pci::Function::DumpAddress() const { kprintf("%u:%u:%u\n", bus_, device_, func_); } +void ebbrt::pci::Function::DumpInfo() const { + kprintf("Vendor ID: 0x%x ", GetVendorId()); + kprintf("Device ID: 0x%x\n", GetDeviceId()); +} + ebbrt::pci::Bar::Bar(pci::Device& dev, uint32_t bar_val, uint8_t idx) : vaddr_(nullptr), is_64_(false), prefetchable_(false) { mmio_ = !(bar_val & kIoSpaceFlag); @@ -226,6 +262,8 @@ ebbrt::pci::Bar::~Bar() { kbugon(vaddr_ != nullptr, "pci::Bar: Need to free mapped region\n"); } +void* ebbrt::pci::Bar::GetVaddr() { return vaddr_; } + bool ebbrt::pci::Bar::Is64() const { return is_64_; } void ebbrt::pci::Bar::Map() { @@ -233,10 +271,21 @@ void ebbrt::pci::Bar::Map() { return; auto npages = align::Up(size_, pmem::kPageSize) >> pmem::kPageShift; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + auto pf = std::make_unique(); + auto& ref = *pf; + auto page = vmem_allocator->Alloc(npages, std::move(pf)); + vaddr_ = reinterpret_cast(page.ToAddr()); + kbugon(page == 
Pfn::None(), "Failed to allocate virtual pages for mmio\n"); + vmem::MapMemory(page, Pfn::Down(addr_), size_); + ref.SetMap(page, Pfn::Down(addr_), size_); +#else auto page = vmem_allocator->Alloc(npages); vaddr_ = reinterpret_cast(page.ToAddr()); kbugon(page == Pfn::None(), "Failed to allocate virtual pages for mmio\n"); vmem::MapMemory(page, Pfn::Down(addr_), size_); +#endif } uint8_t ebbrt::pci::Bar::Read8(size_t offset) { @@ -415,7 +464,15 @@ void ebbrt::pci::Device::SetMsixEntry(size_t entry, uint8_t vector, uint8_t dest) { auto& msix_bar = GetBar(msix_bar_idx_); auto offset = msix_table_offset_ + entry * kMsixTableEntrySize; + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + // more precise + msix_bar.Write32(offset + kMsixTableEntryAddrLow, 0xFEE00000 | dest << 12); + msix_bar.Write32(offset + kMsixTableEntryAddrHigh, 0x0); +#else msix_bar.Write32(offset + kMsixTableEntryAddr, 0xFEE00000 | dest << 12); +#endif + msix_bar.Write32(offset + kMsixTableEntryData, vector); MsixUnmaskEntry(entry); } diff --git a/src/native/Pci.h b/src/native/Pci.h index 15bc1dce..3d40af31 100644 --- a/src/native/Pci.h +++ b/src/native/Pci.h @@ -30,6 +30,7 @@ class Function { uint8_t GetLatencyTimer() const; uint8_t GetHeaderType() const; uint8_t GetBist() const; + uint8_t GetFunc() const; operator bool() const; bool IsMultifunc() const; @@ -40,6 +41,7 @@ class Function { void DisableInt(); void DumpAddress() const; + void DumpInfo() const; protected: static const constexpr uint8_t kVendorIdAddr = 0x00; @@ -87,6 +89,7 @@ class Bar { void Write8(size_t offset, uint8_t val); void Write16(size_t offset, uint16_t val); void Write32(size_t offset, uint32_t val); + void* GetVaddr(); private: static const constexpr uint32_t kIoSpaceFlag = 0x1; @@ -166,6 +169,8 @@ class Device : public Function { static const constexpr size_t kMsixTableEntryAddr = 0; static const constexpr size_t kMsixTableEntryData = 8; static const constexpr size_t kMsixTableEntryControl = 12; + static const constexpr size_t 
kMsixTableEntryAddrLow = 0; + static const constexpr size_t kMsixTableEntryAddrHigh = 4; static const constexpr uint32_t kMsixTableEntryControlMaskBit = 1; diff --git a/src/native/config.cmake b/src/native/config.cmake index f3831979..6e51122c 100644 --- a/src/native/config.cmake +++ b/src/native/config.cmake @@ -1,6 +1,7 @@ # EbbRT native platform-specific configuration option(__EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ "Enable Distributed Runtime Support" ON) option(__EBBRT_ENABLE_NETWORKING__ "Enable Networking" ON) +option(__EBBRT_ENABLE_BAREMETAL_NIC__ "Enable Baremetal NIC" OFF) option(__EBBRT_ENABLE_TRACE__ "Enable Tracing Subsystem" OFF) option(LARGE_WINDOW_HACK "Enable Large TCP Window Hack" OFF) option(PAGE_CHECKER "Enable Page Checker" OFF) diff --git a/src/native/config.h.in b/src/native/config.h.in index 5344cd86..f90587bf 100644 --- a/src/native/config.h.in +++ b/src/native/config.h.in @@ -1,6 +1,7 @@ #cmakedefine __EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ #cmakedefine __EBBRT_ENABLE_NETWORKING__ #cmakedefine __EBBRT_ENABLE_TRACE__ +#cmakedefine __EBBRT_ENABLE_BAREMETAL_NIC__ #cmakedefine LARGE_WINDOW_HACK #cmakedefine PAGE_CHECKER #cmakedefine VIRTIO_ZERO_COPY From f837e9de07658ec82792a8e7806e0b40d3578509 Mon Sep 17 00:00:00 2001 From: Han Date: Sat, 6 Apr 2019 14:10:07 -0400 Subject: [PATCH 03/20] Add checksum offload for ixgbe --- src/native/NetChecksum.cc | 5 +++++ src/native/NetChecksum.h | 2 ++ src/native/NetTcp.cc | 5 ++++- 3 files changed, 11 insertions(+), 1 deletion(-) diff --git a/src/native/NetChecksum.cc b/src/native/NetChecksum.cc index f92e5bfc..bfd60c09 100644 --- a/src/native/NetChecksum.cc +++ b/src/native/NetChecksum.cc @@ -156,6 +156,11 @@ uint16_t ebbrt::OffloadPseudoCsum(const IOBuf& buf, uint8_t proto, return From32To16(PseudoCsum(buf.ComputeChainDataLength(), proto, src, dst)); } +uint16_t ebbrt::OffloadPseudoCsumTso(uint8_t proto, + Ipv4Address src, Ipv4Address dst) { + return From32To16(PseudoCsum(0, proto, src, dst)); +} + // Calculate 
the Ipv4 pseudo checksum with the provided header information uint16_t ebbrt::IpPseudoCsum(const IOBuf& buf, uint8_t proto, Ipv4Address src, Ipv4Address dst) { diff --git a/src/native/NetChecksum.h b/src/native/NetChecksum.h index 8d875f95..9274c8ee 100644 --- a/src/native/NetChecksum.h +++ b/src/native/NetChecksum.h @@ -14,6 +14,8 @@ namespace ebbrt { uint16_t OffloadPseudoCsum(const IOBuf& buf, uint8_t proto, Ipv4Address src, Ipv4Address dst); +uint16_t OffloadPseudoCsumTso(uint8_t proto, Ipv4Address src, + Ipv4Address dst); uint16_t IpPseudoCsum(const IOBuf& buf, uint8_t proto, Ipv4Address src, Ipv4Address dst); uint16_t IpCsum(const IOBuf& buf); diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index c0f833fb..08a99893 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ -712,7 +712,6 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( if (TcpSeqGT(rcv_nxt, info.seqno)) { // Trim the front - kprintf(">> rcv_nxt > info.seqno \n"); buf->Advance(rcv_nxt - info.seqno); info.tcplen -= rcv_nxt - info.seqno; } @@ -1085,6 +1084,10 @@ void ebbrt::NetworkManager::TcpEntry::SendSegment(TcpSegment& segment) { pinfo.gso_type = PacketInfo::kGsoTcpv4; pinfo.hdr_len = segment.th.HdrLen(); pinfo.gso_size = mss; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + segment.th.checksum = + OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); +#endif } network_manager->SendIp(CreateRefChain(*(segment.buf)), address, From 5dee04920c080d04cee7fe383acc22a7a44a6242 Mon Sep 17 00:00:00 2001 From: Han Date: Sat, 6 Apr 2019 14:19:02 -0400 Subject: [PATCH 04/20] fixed an allocation error in IOBuf, seems to be baremetal only --- src/IOBuf.h | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/src/IOBuf.h b/src/IOBuf.h index c430f1a3..0804e3d1 100644 --- a/src/IOBuf.h +++ b/src/IOBuf.h @@ -320,11 +320,34 @@ class MutIOBuf : public IOBuf { template T& GetNoAdvance() { assert(p_->Length() > 0); + +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if 
(p_->Length() - offset_ < sizeof(T)) { + // request straddles buffers, allocate a new chunk of memory to copy it + // into (so it is contiguous) + chunk_list.emplace_front(); + auto& chunk = chunk_list.front(); + chunk.reserve(sizeof(T)); + auto p = p_; + auto len = sizeof(T); + auto offset = offset_; + while (len > 0) { + auto remainder = std::min(p->Length() - offset, len); + auto data = p->Data() + offset; + chunk.insert(chunk.end(), data, data + remainder); + p = p->Next(); + offset = 0; + len -= remainder; + } + return *reinterpret_cast(Data()); + } +#else + if (p_->Length() - offset_ < sizeof(T)) { throw std::runtime_error( "MutDataPointer::Get(): request straddles buffer"); } - +#endif return *reinterpret_cast(Data()); } @@ -355,6 +378,7 @@ class MutIOBuf : public IOBuf { private: MutIOBuf* p_{nullptr}; size_t offset_{0}; + std::forward_list> chunk_list; }; MutDataPointer GetMutDataPointer() { return MutDataPointer(this); } From 032175f365cf3212a523178bc4c578b8af63d798 Mon Sep 17 00:00:00 2001 From: Han Date: Sat, 6 Apr 2019 15:04:34 -0400 Subject: [PATCH 05/20] updated IxgbeDrver for TSO and code refactor --- src/native/IxgbeDriver.cc | 208 ++++++++++++++++++++------------------ src/native/IxgbeDriver.h | 38 ++++--- 2 files changed, 136 insertions(+), 110 deletions(-) diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index ab28293b..4efc4fb2 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -99,26 +99,31 @@ void ebbrt::IxgbeDriverRep::AddContext(uint8_t idx, uint8_t maclen, ixgmq_.tx_isctx_[tail] = true; // refer to 82599 datasheet for these settings + actx->iplen = iplen; + actx->maclen = maclen; + actx->dytp = 0b0010; actx->dext = 1; actx->idx = idx; - actx->maclen = maclen; - actx->iplen = iplen; actx->ipv4 = 1; - actx->l4len = 0; // ignored when TSE not set + //actx->l4len = 0; // ignored when TSE not set + actx->l4len = 0x14; // TSE for TCP is 20 actx->l4t = l4type; + actx->mss = 0x5b4; // MSS - 1460 + 
//actx->mss = 0x5a8; // MSS - 1448 + // need to increment tail ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; } // Add a new packet to be transmitted -void ebbrt::IxgbeDriverRep::AddTx(const uint8_t* pa, uint64_t len, +void ebbrt::IxgbeDriverRep::AddTx(uint64_t pa, uint64_t len, uint64_t totallen, bool first, bool last, uint8_t ctx, bool ip_cksum, - bool tcpudp_cksum) { + bool tcpudp_cksum, bool tse, int hdr_len) { tdesc_advance_tx_rf_t* actx; auto tail = ixgmq_.tx_tail_; @@ -131,37 +136,40 @@ void ebbrt::IxgbeDriverRep::AddTx(const uint8_t* pa, uint64_t len, // pa is physical address of where send buffer exists actx->address = reinterpret_cast(pa); + actx->dtalen = len; if (first) { - actx->paylen = totallen; + if(tse) { + actx->paylen = totallen - hdr_len; + } else { + actx->paylen = totallen; + } + + // checksum + actx->ifcs = 1; + + // tcp segmentation offload + if(tse) { + actx->tse = 1; + } } // type is advanced actx->dtyp = 0b0011; actx->dext = 1; - // rs bit should only be set when eop is set - if (last) { - actx->rs = 1; - } else { - actx->rs = 0; - } - - // checksum - actx->ifcs = 1; - // set last packet bit if (last) { actx->eop = 1; + // rs bit should only be set when eop is set + actx->rs = 1; } else { actx->eop = 0; } - // TODO enable ip checksum - if (ctx != -1) { + if (ctx != -1 && first) { actx->idx = ctx; - actx->cc = 1; - actx->ixsm = ip_cksum; // no ip checksum + actx->ixsm = ip_cksum; // ip checksum offload actx->txsm = tcpudp_cksum; // udp or tcp checksum offload } @@ -170,20 +178,18 @@ void ebbrt::IxgbeDriverRep::AddTx(const uint8_t* pa, uint64_t len, } void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { - auto dp = buf->GetDataPointer(); - auto len = buf->ComputeChainDataLength(); - auto count = buf->CountChainElements(); bool ip_cksum = false; bool tcpudp_cksum = false; - - ebbrt::kbugon(len >= 0xA0 * 1000, - "%s packet len bigger than max ether length\n", __FUNCTION__); - + 
uint64_t data; + size_t len, count; + int mcore = (int)Cpu::GetMine(); + // TODO threshold for triggering reclaim tx buffers #ifndef TX_HEAD_WB size_t free_desc = IxgbeDriver::NTXDESCS - (std::abs(static_cast(ixgmq_.tx_tail_ - ixgmq_.tx_head_))); + count = buf->CountChainElements(); // free descripts must have enough for count in chained iobufs if (free_desc < (count + 1)) { // reclaim buffers @@ -201,81 +207,90 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { if (pinfo.flags & PacketInfo::kNeedsIpCsum) { ip_cksum = true; } - - // NEED CHECKSUM + if (pinfo.flags & PacketInfo::kNeedsCsum) { tcpudp_cksum = true; - - // check datasheet for numbers - if (pinfo.csum_offset == 6) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); - } else if (pinfo.csum_offset == 16) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); - } else { - ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); + } + + // buffers are chained + if(buf->IsChained()) { + len = buf->ComputeChainDataLength(); + count = buf->CountChainElements(); + + if(tcpudp_cksum) { + if (pinfo.csum_offset == 6) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); + } else if (pinfo.csum_offset == 16) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); + } else { + ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); + } } - // if buffer is chained - if (buf->IsChained()) { - size_t counter = 0; + // 7.2.1.1 + // A packet (or multiple packets in transmit segmentation) can span + // any number of buffers (and their descriptors) up to a limit of 40 minus WTHRESH minus 2 + if(count > 38) { + //ebbrt::kprintf_force("count = %d\n", count); + std::unique_ptr b; + b = MakeUniqueIOBuf(len); + auto mdata = b->MutData(); for (auto& buf_it : *buf) { - counter++; - - // first buffer - if (counter == 1) { - AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), len, - true, false, 0, ip_cksum, tcpudp_cksum); - } else { - // last buffer - if (counter == 
count) { - AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), - len, false, true, 0, ip_cksum, tcpudp_cksum); - } else { - AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), - len, false, false, 0, ip_cksum, tcpudp_cksum); - } - } + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); } - } - // not chained - else { - AddTx(buf->Data(), len, len, true, true, 0, ip_cksum, tcpudp_cksum); - } - } else { - // NO CHECKSUM FLAG SET - // if buffer is chained - if (buf->IsChained()) { + data = reinterpret_cast(b->MutData()); + AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); + + } else { size_t counter = 0; for (auto& buf_it : *buf) { - counter++; - - // first buffer - if (counter == 1) { - AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), len, - true, false, 0, ip_cksum, tcpudp_cksum); - } else { - // last buffer - if (counter == count) { - AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), - len, false, true, 0, ip_cksum, tcpudp_cksum); - } else { - AddTx(buf_it.Data(), reinterpret_cast(buf_it.Length()), - len, false, false, 0, ip_cksum, tcpudp_cksum); - } - } + counter++; + + uint64_t dlen = reinterpret_cast(buf_it.Length()); + uint64_t daddr = reinterpret_cast(buf_it.Data()); + + // first buffer + if (counter == 1) { + AddTx(daddr, dlen, len, true, false, 0, ip_cksum, tcpudp_cksum, + len > 1514, static_cast(pinfo.hdr_len)); + //last buffer + } else if (counter == count ) { + AddTx(daddr, dlen, len, false, true, 0, ip_cksum, tcpudp_cksum, + len > 1514, static_cast(pinfo.hdr_len)); + } else { + AddTx(daddr, dlen, len, false, false, 0, ip_cksum, tcpudp_cksum, + len > 1514, static_cast(pinfo.hdr_len)); + } } } - // not chained - else { - AddTx(buf->Data(), len, len, true, true, 0, ip_cksum, tcpudp_cksum); + } else { // buffers NOT chained + data = reinterpret_cast(buf->Data()); + len = buf->ComputeChainDataLength(); + + if(tcpudp_cksum) { + // check datasheet for numbers + if 
(pinfo.csum_offset == 6) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); + } else if (pinfo.csum_offset == 16) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); + } else { + ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); + } + + AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); + } else { + AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); } } - + // bump tx_tail // indicates position beyond last descriptor hw - WriteTdt_1(Cpu::GetMine(), ixgmq_.tx_tail_); + size_t tail = ixgmq_.tx_tail_; + WriteTdt_1(mcore, tail); } + void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { // Disable RXCTRL - 8.2.3.8.10 bar0_.Write32(0x03000, m); @@ -1342,7 +1357,8 @@ void ebbrt::IxgbeDriver::Init() { #ifdef RSC_EN // TODO: RSC delay value, just a guess at (1 + 1) * 4us = 8 us // Recommended value based on 7.3.2.1.1 - WriteGpie(0x1 << 11); + WriteGpie(IxgbeDriver::RSC_DELAY << 11); + ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d\n", (IxgbeDriver::RSC_DELAY + 1) * 4); #endif /* FreeBSD: @@ -1452,7 +1468,7 @@ void ebbrt::IxgbeDriver::Init() { * Clear RTTDCS.ARBDIS to 0b. */ WriteRttdcs(0x1 << 6); - WriteDtxmxszrq(0xFFF); + WriteDtxmxszrq(MAX_BYTES_NUM_REQ); WriteTxpbsize(0, 0xA0 << 10); WriteTxpbThresh(0, 0xA0); for (auto i = 1; i < 8; i++) { @@ -1581,7 +1597,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // must be greater than rsc delay // WriteEitr(i, 0x80 << 3); // 7 * 2us = 14 us - WriteEitr(i, 0x7 << 3); // 16 * 2us = 32 us + WriteEitr(i, (IxgbeDriver::ITR_INTERVAL << 3)); // 7.3.1.4 - Note that there are no EIAC(1)...EIAC(2) registers. // The hardware setting for interrupts 16...63 is always auto clear. 
@@ -1835,7 +1851,6 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, #else // no RSC so just get one packet at a time - int c = static_cast(Cpu::GetMine()); rdesc_legacy_t tmp; tmp = ixgmq_.rx_ring_[ixgmq_.rx_head_]; @@ -1880,10 +1895,8 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { bool process_rsc; uint32_t count; uint32_t rnt; - static bool ret = false; process_rsc = false; -retry: rxflag = 0; count = 0; rnt = 0; @@ -1957,11 +1970,6 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { // update reg WriteRdt_1(Cpu::GetMine(), ixgmq_.rx_tail_); } - - // keep looping back once we see start of rsc context - if (likely(ret)) { - goto retry; - } } ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) @@ -1999,3 +2007,9 @@ void ebbrt::IxgbeDriverRep::WriteEimcn(uint32_t n, uint32_t m) { auto reg = root_.bar0_.Read32(0x00AB0 + 4 * n); root_.bar0_.Write32(0x00AB0 + 4 * n, reg | m); } + +// 8.2.3.5.4 Extended Interrupt Mask Clear Register- EIMC (0x00888; WO) +void ebbrt::IxgbeDriverRep::WriteEimc(uint32_t m) { root_.bar0_.Write32(0x00888, m); } + +// 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) +void ebbrt::IxgbeDriverRep::WriteEims(uint32_t m) { root_.bar0_.Write32(0x00880, m); } diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 46670a2d..16be5164 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -20,10 +20,10 @@ #include "SlabAllocator.h" // Receive Side Scaling (RSC) enabled -#define RSC_EN +//#define RSC_EN // Direct Cache Access (DCA) enabled -#define DCA_ENABLE -// Transmit Header Writeback enabled +//#define DCA_ENABLE +//// Transmit Header Writeback enabled #define TX_HEAD_WB namespace ebbrt { @@ -79,7 +79,7 @@ class IxgbeDriver : public EthernetDevice { protected: static const constexpr uint16_t kIxgbeVendorId = 0x8086; - static const constexpr uint16_t kIxgbeDeviceId = 0x10F8; // 0x10FB; + static const constexpr uint16_t kIxgbeDeviceId = 0x10FB; /* FreeBSD: * 
RxDescriptors Valid Range: 64-4096 Default Value: 256 This value is the @@ -91,12 +91,24 @@ class IxgbeDriver : public EthernetDevice { * against the system mbuf pool limit, you can tune nmbclusters * to adjust for this. */ - static const constexpr uint32_t NTXDESCS = 256; - static const constexpr uint32_t NRXDESCS = 256; - // static const constexpr uint32_t NTXDESCS = 4096; - // static const constexpr uint32_t NRXDESCS = 4096; - static const constexpr uint32_t RXBUFSZ = 4096; - // static const constexpr uint32_t RXBUFSZ = 16384; + // Linux Defaults + static const constexpr uint32_t NTXDESCS = 512; + static const constexpr uint32_t NRXDESCS = 512; + static const constexpr uint32_t RXBUFSZ = 2048; + static const constexpr uint32_t BSIZEHEADER = 256; + + //static const constexpr uint32_t NTXDESCS = 8192; + //static const constexpr uint32_t NRXDESCS = 8192; + //static const constexpr uint32_t RXBUFSZ = 4096; + //static const constexpr uint32_t RXBUFSZ = 16384; + + static const constexpr uint8_t ITR_INTERVAL = 6; + // 3 bits only (0 - 7) in (RSC_DELAY + 1) * 4 us + static const constexpr uint8_t RSC_DELAY = 1; + + // DMA Tx TCP Max Allow Size Requests — DTXMXSZRQ + static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0x10; + //static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0xFFF; // Class with per core queue data structures class e10Kq { @@ -446,17 +458,17 @@ class IxgbeDriverRep : public MulticoreEbb { void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, enum l4_type l4type); void AddTx(const uint8_t* pa, uint64_t len, uint64_t totallen, bool first, - bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum); + bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum, bool tse, int hdr_len); private: uint16_t ReadRdh_1(uint32_t n); uint16_t ReadRdt_1(uint32_t n); void WriteRdt_1(uint32_t n, uint32_t m); void WriteRdh_1(uint32_t n, uint32_t m); - // uint16_t ReadRdt_1(uint32_t n); - // uint16_t ReadRdh_1(uint32_t n); void WriteTdt_1(uint32_t 
n, uint32_t m); void WriteEimcn(uint32_t n, uint32_t m); + void WriteEimc(uint32_t m); + void WriteEims(uint32_t m); uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, bool* process_rsc, uint32_t* rnt); From f32cd8afc43a5c3e5e9fecd85fa4eefad9177747 Mon Sep 17 00:00:00 2001 From: Han Date: Sat, 6 Apr 2019 15:08:18 -0400 Subject: [PATCH 06/20] updated NetDhcp --- src/native/NetDhcp.cc | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/native/NetDhcp.cc b/src/native/NetDhcp.cc index cc931ede..d5ce53eb 100644 --- a/src/native/NetDhcp.cc +++ b/src/native/NetDhcp.cc @@ -13,6 +13,7 @@ ebbrt::Future ebbrt::NetworkManager::StartDhcp() { kbugon(Cpu::GetMine() != 0, "Dhcp not started on core 0!"); +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ // Before DHCP, check if a static IP has been specified auto cmdline = std::string(ebbrt::multiboot::CmdLine()); auto loc = cmdline.find("nodhcp"); @@ -20,6 +21,7 @@ ebbrt::Future ebbrt::NetworkManager::StartDhcp() { kprintf("Warning: Skipping DHCP, static IP detected\n"); return MakeReadyFuture(); } +#endif if (interface_) return interface_->StartDhcp(); @@ -194,10 +196,12 @@ void ebbrt::NetworkManager::Interface::DhcpHandleAck( kassert(netmask_opt); addr->netmask = *netmask_opt; +#ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ // assert fails in baremetal auto gw_opt = DhcpGetOptionLong(message, kDhcpOptionRouter); kassert(gw_opt); addr->gateway = *gw_opt; - +#endif + SetAddress(std::unique_ptr(addr)); DhcpSetState(DhcpPcb::State::kBound); From 86b0252529de0f4e05430bbe3988ad77fe69fd90 Mon Sep 17 00:00:00 2001 From: Han Date: Sat, 6 Apr 2019 15:20:20 -0400 Subject: [PATCH 07/20] fixed NetTcp.cc baremetal bug where a segfault on double delete was happening when user closes a connection twice. 
--- src/native/NetTcp.cc | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index 08a99893..b9c304ea 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ -363,6 +363,7 @@ void ebbrt::NetworkManager::TcpEntry::Destroy() { std::lock_guard guard(network_manager->tcp_write_lock_); network_manager->tcp_pcbs_.erase(*this); } + kassert(this); event_manager->DoRcu([this]() { delete this; }); } @@ -719,6 +720,9 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( // Second check the RST bit if (unlikely(flags & kTcpRst)) { state = kClosed; +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + bool flag = false; +#endif if (state == kSynReceived) { // RFC 793 Page 70: // "If this connection was initiated with a passive OPEN (i.e., came @@ -729,7 +733,6 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( // In either case, all segments on the retransmission queue should be // removed. And in the active OPEN case, enter the CLOSED state and // delete the TCB, and return." - handler->Abort(); } else if (state >= kEstablished && state <= kCloseWait) { // If the RST bit is set then, any outstanding RECEIVEs and SEND @@ -745,10 +748,17 @@ bool ebbrt::NetworkManager::TcpEntry::Receive( // RFC 793 Page 70: // If the RST bit is set then, enter the CLOSED state, delete the TCB, // and return. 
+#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + flag = true; +#endif } Purge(); DisableTimers(); - Destroy(); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + if (flag == false) Destroy(); +#else + Destroy(); +#endif return false; } else if (unlikely(flags & kTcpSyn)) { // RFC 793 Page 71: From 8a5d24911317986e566480579a54c9bfb5e67be3 Mon Sep 17 00:00:00 2001 From: Han Date: Sat, 6 Apr 2019 17:07:15 -0400 Subject: [PATCH 08/20] misc compile and runtime fixes --- src/native/Debug.cc | 3 +++ src/native/IxgbeDriver.cc | 1 - src/native/IxgbeDriver.h | 2 +- src/native/Pci.cc | 2 +- src/native/config.cmake | 6 +++--- 5 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/native/Debug.cc b/src/native/Debug.cc index 1f45a06d..fc769c2a 100644 --- a/src/native/Debug.cc +++ b/src/native/Debug.cc @@ -12,6 +12,9 @@ void ebbrt::kvprintf(const char* __restrict format, va_list va) { auto len = vsnprintf(nullptr, 0, format, va); char buffer[len + 1]; // NOLINT vsnprintf(buffer, len + 1, format, va2); +#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + console::Write("\r"); +#endif console::Write(buffer); } diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 4efc4fb2..81b3f283 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -1905,7 +1905,6 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt) == 0) { // hit last rsc context, start to process all buffers if (process_rsc) { - ret = true; process_rsc = false; count++; diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 16be5164..1f142988 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -457,7 +457,7 @@ class IxgbeDriverRep : public MulticoreEbb { void Send(std::unique_ptr buf, PacketInfo pinfo); void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, enum l4_type l4type); - void AddTx(const uint8_t* pa, uint64_t len, uint64_t totallen, bool first, + void AddTx(uint64_t pa, uint64_t len, 
uint64_t totallen, bool first, bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum, bool tse, int hdr_len); private: diff --git a/src/native/Pci.cc b/src/native/Pci.cc index 2a740f7e..83a4674e 100644 --- a/src/native/Pci.cc +++ b/src/native/Pci.cc @@ -118,7 +118,7 @@ void ebbrt::pci::Init() { #ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ // TODO - Kludge to identify where NIC sits in device tree, should incorporate // Dan's pull request for enumerating bridges - EnumerateBus(0x1); + EnumerateBus(0x4); #endif } diff --git a/src/native/config.cmake b/src/native/config.cmake index 6e51122c..7c477114 100644 --- a/src/native/config.cmake +++ b/src/native/config.cmake @@ -1,7 +1,7 @@ # EbbRT native platform-specific configuration -option(__EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ "Enable Distributed Runtime Support" ON) +option(__EBBRT_ENABLE_DISTRIBUTED_RUNTIME__ "Enable Distributed Runtime Support" OFF) option(__EBBRT_ENABLE_NETWORKING__ "Enable Networking" ON) -option(__EBBRT_ENABLE_BAREMETAL_NIC__ "Enable Baremetal NIC" OFF) +option(__EBBRT_ENABLE_BAREMETAL_NIC__ "Enable Baremetal NIC" ON) option(__EBBRT_ENABLE_TRACE__ "Enable Tracing Subsystem" OFF) option(LARGE_WINDOW_HACK "Enable Large TCP Window Hack" OFF) option(PAGE_CHECKER "Enable Page Checker" OFF) @@ -13,7 +13,7 @@ configure_file(${PLATFORM_SOURCE_DIR}/config.h.in config.h @ONLY) set(CMAKE_CXX_FLAGS "-Wall -Werror -std=gnu++14 -include ${CMAKE_CURRENT_BINARY_DIR}/config.h") set(CMAKE_CXX_FLAGS_DEBUG "-O0 -g3") set(CMAKE_CXX_FLAGS_MINSIZEREL "-Os -DNDEBUG") -set(CMAKE_CXX_FLAGS_RELEASE "-O4 -flto -DNDEBUG") +set(CMAKE_CXX_FLAGS_RELEASE "-O4 -flto") set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O2 -g3") set(CMAKE_SHARED_LIBRARY_LINK_C_FLAGS) set(CMAKE_ASM_FLAGS "-DASSEMBLY") From c80dd47c12ae4f63b76ddbbe07013d6dbf33866e Mon Sep 17 00:00:00 2001 From: Han Date: Mon, 8 Apr 2019 13:59:34 -0400 Subject: [PATCH 09/20] Initialized RSS to distribute packets evenly to multiple cores, added perf and statistic counters to IxgbeDriver, 
hijacked Ping messages to act as triggers to set up state --- src/native/Ixgbe.h | 2 + src/native/IxgbeDriver.cc | 123 +++++++++++++++++++++++++++++++++++--- src/native/IxgbeDriver.h | 28 ++++++++- src/native/Net.h | 1 + src/native/NetIcmp.cc | 3 +- 5 files changed, 145 insertions(+), 12 deletions(-) diff --git a/src/native/Ixgbe.h b/src/native/Ixgbe.h index 1a966ec1..b2ed523b 100644 --- a/src/native/Ixgbe.h +++ b/src/native/Ixgbe.h @@ -34,6 +34,8 @@ #define IXGBE_MFLCN_RPFCE 0x00000004 /* Receive Priority FC Enable */ #define IXGBE_MFLCN_RFCE 0x00000008 /* Receive FC Enable */ +#define IXGBE_EITR_CNT_WDIS 0x80000000 + enum l4_type { l4_type_udp = 0, l4_type_tcp, l4_type_sctp, l4_type_rsv }; #define ETHHDR_LEN 14 diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 81b3f283..76320fb6 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -28,6 +28,11 @@ void ebbrt::IxgbeDriver::Create(pci::Device& dev) { ixgbe_dev->ebb_ = IxgbeDriverRep::Create(ixgbe_dev, ebb_allocator->AllocateLocal()); + // only even core numbers + if(static_cast(Cpu::Count()) > 1) { + kassert(static_cast(Cpu::Count()) % 2 == 0); + } + // initialize per core rx and tx queues for (size_t i = 0; i < Cpu::Count(); i++) { ixgbe_dev->SetupMultiQueue(i); @@ -44,7 +49,47 @@ const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { return mac_addr_; } +void ebbrt::IxgbeDriver::DumpStats() { + for (size_t i = 0; i < Cpu::Count(); i++) { + ebbrt::kprintf("Core %d STATS:\n", (int)i); + ebbrt::kprintf("\t num_recv_itrs:%lld\n", ixgmq[i]->stat_num_itr); + ebbrt::kprintf("\t num_send:%lld\n", ixgmq[i]->stat_num_send); + ebbrt::kprintf("\t num_rx_desc_proc:%lld\n", ixgmq[i]->stat_num_rx); + ebbrt::kprintf("\t num_tx_desc_proc:%lld\n", ixgmq[i]->stat_num_tx); + + // reset to 0 + ixgmq[i]->stat_num_itr = 0; + ixgmq[i]->stat_num_send = 0; + ixgmq[i]->stat_num_rx = 0; + ixgmq[i]->stat_num_tx = 0; + + if(ixgmq[i]->stat_perf == false) { + ixgmq[i]->perfCycles = 
ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + ixgmq[i]->perfCycles.Start(); + ixgmq[i]->perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); + ixgmq[i]->perfInst.Start(); + ixgmq[i]->stat_perf =true; + } else { + ixgmq[i]->perfCycles.Stop(); + ixgmq[i]->perfInst.Stop(); + double cyc = static_cast(ixgmq[i]->perfCycles.Read()); + double inst = static_cast(ixgmq[i]->perfInst.Read()); + + ebbrt::kprintf("Core %d PMC:\n", (int)i); + ebbrt::kprintf("\t cycles:%llf \n", cyc); + ebbrt::kprintf("\t instructions:%llf\n", inst); + ebbrt::kprintf("\t ipc: %llf\n", inst/cyc); + ixgmq[i]->stat_perf = false; + } + } +} + void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { +#ifdef STATS_EN + if(pinfo.get_stats) { + DumpStats(); + } +#endif ebb_->Send(std::move(buf), std::move(pinfo)); } @@ -175,6 +220,9 @@ void ebbrt::IxgbeDriverRep::AddTx(uint64_t pa, uint64_t len, ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; +#ifdef STATS_EN + ixgmq_.stat_num_tx ++; +#endif } void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { @@ -182,7 +230,11 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { bool tcpudp_cksum = false; uint64_t data; size_t len, count; - int mcore = (int)Cpu::GetMine(); + uint32_t mcore = static_cast(Cpu::GetMine()); + +#ifdef STATS_EN + ixgmq_.stat_num_send ++; +#endif // TODO threshold for triggering reclaim tx buffers #ifndef TX_HEAD_WB @@ -765,6 +817,7 @@ void ebbrt::IxgbeDriver::WritePsrtypeZero(uint32_t n) { // 4*n, n=0...31; RW) void ebbrt::IxgbeDriver::WriteReta(uint32_t n, uint32_t m) { bar0_.Write32(0x0EB00 + 4 * n, m); + ebbrt::kprintf("WriteReta(n=%d) = 0x%08X\n", n, m); } // 8.2.3.7.6 Receive Filter Control Register — RFCTL (0x05008; RW) @@ -1276,6 +1329,7 @@ void ebbrt::IxgbeDriver::GlobalReset() { **/ void ebbrt::IxgbeDriver::Init() { uint64_t d_mac; + uint32_t ncore = static_cast(Cpu::Count()); ebbrt::kprintf("%s ", 
__PRETTY_FUNCTION__); bar0_.Map(); // allocate virtual memory @@ -1419,6 +1473,10 @@ void ebbrt::IxgbeDriver::Init() { #ifndef RSC_EN WriteRxcsum(0x1 << 12); // IP payload checksum enable +#else + // note: PCSD: The Fragment Checksum and IP Identification fields are mutually exclusive with + // the RSS hash. Only one of the two options is reported in the Rx descriptor. + WriteRxcsum(0x2000); #endif // TODO RQTC @@ -1432,10 +1490,48 @@ void ebbrt::IxgbeDriver::Init() { WriteMpsar(i, 0x0); } - // TODO RSSRK - - for (auto i = 0; i < 32; i++) { - WriteReta(i, 0x0); + // RSSRK - random seeds taken from Linux + WriteRssrk(0, 0xA38DD80F); + WriteRssrk(1, 0xD107C3DC); + WriteRssrk(2, 0x8CEB19C4); + WriteRssrk(3, 0xA41E1B6B); + WriteRssrk(4, 0xB7218638); + WriteRssrk(5, 0x6B8B6155); + WriteRssrk(6, 0xDC8D08B5); + WriteRssrk(7, 0xD2E8684B); + WriteRssrk(8, 0xECEF8417); + WriteRssrk(9, 0xE56C84D5); + + // Fill in RSS redirection table (128 entries), sets which core the lowest 7 bits of hashed output goes to + // hacky atm + for (auto i = 0; i < 32; i += 4) { + // all route to core 0 + if(ncore == 1) { + WriteReta(i, 0x0000000); + WriteReta(i+1, 0x0000000); + WriteReta(i+2, 0x0000000); + WriteReta(i+3, 0x0000000); + } else if(ncore == 2) { + WriteReta(i, 0x1010100); + WriteReta(i+1, 0x1010100); + WriteReta(i+2, 0x1010100); + WriteReta(i+3, 0x1010100); + } else if(ncore == 4) { + WriteReta(i, 0x3020100); + WriteReta(i+1, 0x3020100); + WriteReta(i+2, 0x3020100); + WriteReta(i+3, 0x3020100); + } else if(ncore == 8) { + WriteReta(i, 0x3020100); + WriteReta(i+1, 0x7060504); + WriteReta(i+2, 0x3020100); + WriteReta(i+3, 0x7060504); + } else { + WriteReta(i, 0x3020100); + WriteReta(i+1, 0x7060504); + WriteReta(i+2, 0xB0A0908); + WriteReta(i+3, 0xF0E0D0C); + } } for (auto i = 0; i < 128; i++) { @@ -1486,7 +1582,8 @@ void ebbrt::IxgbeDriver::Init() { for (auto i = 1; i < 8; i++) { WriteRxpbsize(i, 0x0); } - WriteMrqc(0x0); + WriteMrqc(0x330001); + WritePfqde(0x0); WriteRtrup2tc(0x0); 
WriteMflcn(0x0 << 2); @@ -1570,6 +1667,8 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WritePsrtype(i, 0x1 << 4); // Split received TCP packets after TCP header. #endif + // In NON-IOV, only psrtype[0] is used + WritePsrtype(0, 0x40001330); // Set head and tail pointers WriteRdt_1(i, 0x0); WriteRdh_1(i, 0x0); @@ -1583,6 +1682,8 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // setup RX interrupts for queue i dev_.SetMsixEntry(i, rcv_vector, ebbrt::Cpu::GetByIndex(i)->apic_id()); + ebbrt::kprintf("Core %d: BSIZEPACKET=%d bytes NTXDESCS=%d NRXDESCS=%d ITR_INTERVAL=%dus RCV_VECTOR=%d APIC_ID=%d \n", i, RXBUFSZ, NTXDESCS, NRXDESCS, (int) (IxgbeDriver::ITR_INTERVAL * 2), (int)rcv_vector, (int)(ebbrt::Cpu::GetByIndex(i)->apic_id())); + // don't set up interrupts for tx since we have head writeback?? auto qn = i / 2; // put into correct IVAR @@ -1597,7 +1698,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // must be greater than rsc delay // WriteEitr(i, 0x80 << 3); // 7 * 2us = 14 us - WriteEitr(i, (IxgbeDriver::ITR_INTERVAL << 3)); + WriteEitr(i, (IxgbeDriver::ITR_INTERVAL << 3) | IXGBE_EITR_CNT_WDIS); // 7.3.1.4 - Note that there are no EIAC(1)...EIAC(2) registers. // The hardware setting for interrupts 16...63 is always auto clear. 
@@ -1880,7 +1981,9 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, // bump head ptr ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - +#ifdef STATS_EN + ixgmq_.stat_num_rx ++; +#endif return 0; } #endif @@ -1896,7 +1999,9 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { uint32_t count; uint32_t rnt; process_rsc = false; - +#ifdef STATS_EN + ixgmq_.stat_num_itr ++; +#endif rxflag = 0; count = 0; rnt = 0; diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 1f142988..a8d23c4e 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -18,13 +18,17 @@ #include "Pci.h" #include "Pfn.h" #include "SlabAllocator.h" +#include "Perf.h" // Receive Side Scaling (RSC) enabled //#define RSC_EN // Direct Cache Access (DCA) enabled //#define DCA_ENABLE -//// Transmit Header Writeback enabled -#define TX_HEAD_WB +// Transmit Header Writeback enabled +//#define TX_HEAD_WB + +// Collect Statistics Flag +#define STATS_EN namespace ebbrt { @@ -214,6 +218,20 @@ class IxgbeDriver : public EthernetDevice { #else size_t tx_head_; #endif + + // stats + uint64_t stat_num_itr{0}; + uint64_t stat_num_send{0}; + uint64_t stat_num_rx{0}; + uint64_t stat_num_tx{0}; + + bool stat_perf{false}; + ebbrt::perf::PerfCounter perfCycles; + ebbrt::perf::PerfCounter perfInst; + ebbrt::perf::PerfCounter perfLLC_ref; + ebbrt::perf::PerfCounter perfLLC_miss; + ebbrt::perf::PerfCounter perfTLB_store_miss; + ebbrt::perf::PerfCounter perfTLB_load_miss; }; private: @@ -364,6 +382,10 @@ class IxgbeDriver : public EthernetDevice { void WriteDtxmxszrq(uint32_t m); void WriteMflcn(uint32_t m); void WriteReta(uint32_t n, uint32_t m); + void WriteRssrk(uint32_t n, uint32_t m) { + kassert(n < 10); + bar0_.Write32(0x0EB80 + 4 * n, m); + } void WritePsrtypeZero(uint32_t n); @@ -426,6 +448,8 @@ class IxgbeDriver : public EthernetDevice { uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr); void SendPacket(uint32_t n); + // dump per core stats if STATS_EN + 
void DumpStats(); e10k_queue_t& GetQueue() const { return *ixgq; } e10Kq& GetMultiQueue(size_t index) const { return *ixgmq[index]; } diff --git a/src/native/Net.h b/src/native/Net.h index 3568f0cd..33c501e9 100644 --- a/src/native/Net.h +++ b/src/native/Net.h @@ -46,6 +46,7 @@ struct PacketInfo { uint16_t gso_size{0}; uint16_t csum_start{0}; uint16_t csum_offset{0}; + bool get_stats{false}; }; class EthernetDevice { diff --git a/src/native/NetIcmp.cc b/src/native/NetIcmp.cc index 6ecfde0d..e52076ce 100644 --- a/src/native/NetIcmp.cc +++ b/src/native/NetIcmp.cc @@ -48,7 +48,8 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( PacketInfo pinfo; pinfo.flags = 0; - + // hijacking ping to dump ixgbe statistics + pinfo.get_stats = true; #ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ // hardware ip checksum offload pinfo.flags |= PacketInfo::kNeedsIpCsum; From 68caa4009c6e18e96e265d4dcb275c36d39df612 Mon Sep 17 00:00:00 2001 From: Han Date: Mon, 4 Nov 2019 14:32:31 -0500 Subject: [PATCH 10/20] recursive page table walker --- src/native/VMemAllocator.cc | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/native/VMemAllocator.cc b/src/native/VMemAllocator.cc index 173fdc23..e61df2a5 100644 --- a/src/native/VMemAllocator.cc +++ b/src/native/VMemAllocator.cc @@ -118,6 +118,22 @@ ebbrt::VMemAllocator::Alloc(size_t npages, size_t pages_align, npages); } +// Recursive page table walker +struct Frame { + struct Frame *next; + uint64_t rip; +}; + +void dumpFrames(int n, struct Frame *s) +{ + struct Frame *f; + int i; + for (i=0,f=s; inext,i++) { + ebbrt::kprintf("FRAME[%d]: %p RIP: 0x%llx next:%p\n", -1*i, f, + f->rip, f->next); + } +} + void ebbrt::VMemAllocator::HandlePageFault(idt::ExceptionFrame* ef) { std::lock_guard lock(lock_); auto fault_addr = ReadCr2(); From 5c69645c0f851bf7fc249c95bb9da7ac149eb383 Mon Sep 17 00:00:00 2001 From: Han Date: Mon, 4 Nov 2019 14:34:19 -0500 Subject: [PATCH 11/20] keeping track of changes --- src/native/Cpu.cc | 2 +- 
src/native/IxgbeDriver.cc | 87 +++++++++++++++++++++++++++++++++------ src/native/IxgbeDriver.h | 18 +++++--- src/native/NetIcmp.cc | 2 +- 4 files changed, 89 insertions(+), 20 deletions(-) diff --git a/src/native/Cpu.cc b/src/native/Cpu.cc index 38ea019f..cce44a3e 100644 --- a/src/native/Cpu.cc +++ b/src/native/Cpu.cc @@ -61,4 +61,4 @@ ebbrt::Cpu* ebbrt::Cpu::GetByApicId(size_t apic_id) { return &(*it); } -size_t ebbrt::Cpu::Count() { return cpus->size(); } +size_t ebbrt::Cpu::Count() { return 16; } //return cpus->size(); } diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 76320fb6..69b927b0 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -63,7 +63,7 @@ void ebbrt::IxgbeDriver::DumpStats() { ixgmq[i]->stat_num_rx = 0; ixgmq[i]->stat_num_tx = 0; - if(ixgmq[i]->stat_perf == false) { + /*if(ixgmq[i]->stat_perf == false) { ixgmq[i]->perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); ixgmq[i]->perfCycles.Start(); ixgmq[i]->perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); @@ -80,8 +80,8 @@ void ebbrt::IxgbeDriver::DumpStats() { ebbrt::kprintf("\t instructions:%llf\n", inst); ebbrt::kprintf("\t ipc: %llf\n", inst/cyc); ixgmq[i]->stat_perf = false; - } - } + }*/ + } } void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { @@ -334,6 +334,19 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { } else { AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); } + + // dump eth packet info + ebbrt::kprintf("\t Send() on core: %d len=%d\n", mcore, len); + auto p1 = reinterpret_cast(data); + for (int i = 0; i < (int)len; i+=8) { + if (i+8 < (int)len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } + else{ + ebbrt::kprintf("%02X\n", p1[i]); + } + } + ebbrt::kprintf("\n"); } // bump tx_tail @@ -1527,13 +1540,49 @@ void 
ebbrt::IxgbeDriver::Init() { WriteReta(i+2, 0x3020100); WriteReta(i+3, 0x7060504); } else { - WriteReta(i, 0x3020100); - WriteReta(i+1, 0x7060504); - WriteReta(i+2, 0xB0A0908); - WriteReta(i+3, 0xF0E0D0C); + WriteReta(i, 0x3020100); //8 + WriteReta(i+1, 0x7060504); //8 + WriteReta(i+2, 0xB0A0908); // 8 + //WriteReta(i+3, 0x3020100); + //WriteReta(i+2, 0xB0A0908); + //WriteReta(i+3, 0xF0E0D0C); } } + //temp +/* WriteReta(3, 0x3020100); + WriteReta(7, 0x7060504); + WriteReta(11, 0xB0A0908); + WriteReta(15, 0x3020100); + WriteReta(19, 0x7060504); + WriteReta(23, 0xB0A0908); + WriteReta(27, 0x3020100); + WriteReta(31, 0x7060504); + WriteReta(35, 0xB0A0908); + WriteReta(39, 0x3020100); + WriteReta(43, 0x7060504); + WriteReta(47, 0xB0A0908); + WriteReta(51, 0x3020100); + WriteReta(55, 0x7060504); + WriteReta(59, 0xB0A0908); + WriteReta(63, 0x3020100); + WriteReta(67, 0x7060504); + WriteReta(71, 0xB0A0908); + WriteReta(75, 0x3020100); + WriteReta(79, 0x7060504); + WriteReta(83, 0xB0A0908); + WriteReta(87, 0x3020100); + WriteReta(91, 0x7060504); + WriteReta(95, 0xB0A0908); + WriteReta(99, 0x3020100); + WriteReta(103, 0x7060504); + WriteReta(107, 0xB0A0908); + WriteReta(111, 0x3020100); + WriteReta(115, 0x7060504); + WriteReta(119, 0xB0A0908); + WriteReta(123, 0x3020100); + WriteReta(127, 0x3080400);*/ + for (auto i = 0; i < 128; i++) { WriteFtqf(i, 0x0); WriteSaqf(i, 0x0); @@ -2005,6 +2054,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { rxflag = 0; count = 0; rnt = 0; + uint32_t mcore = static_cast(Cpu::GetMine()); // get address of buffer with data while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt) == 0) { @@ -2037,6 +2087,8 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ReclaimRx(); + ebbrt::kprintf("\t ReceivePoll() RSC on core: %d len=%d\n", mcore, static_cast(rsclen)); + root_.itf_.Receive(std::move(b), rxflag); } else { // done with buffer addr above, now to reuse it @@ -2049,7 +2101,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { if (count > 0) { auto tail 
= ixgmq_.rx_tail_; - + // TODO hack - need to set actual length of data otherwise it'll send // leftover 0's ixgmq_.circ_buffer_[tail]->SetLength(len); @@ -2064,15 +2116,26 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.rx_ring_[tail].buffer_address = rxphys; - root_.itf_.Receive(std::move(b), rxflag); + // dump eth packet info + ebbrt::kprintf("\t ReceivePoll() on core: %d len=%d\n", mcore, len); + auto p1 = reinterpret_cast(b->MutData()); + for (int i = 0; i < (int)len; i+=8) { + if (i+8 < (int)len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } + else{ + ebbrt::kprintf("%02X\n", p1[i]); + } + } + root_.itf_.Receive(std::move(b), rxflag); } - } + } } - + // TODO: Update tail register here or above? if (count > 0) { // update reg - WriteRdt_1(Cpu::GetMine(), ixgmq_.rx_tail_); + WriteRdt_1(mcore, ixgmq_.rx_tail_); } } diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index a8d23c4e..0bd6cfd1 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -28,7 +28,8 @@ //#define TX_HEAD_WB // Collect Statistics Flag -#define STATS_EN +//#define STATS_EN +//#define MAX_DESC namespace ebbrt { @@ -95,14 +96,19 @@ class IxgbeDriver : public EthernetDevice { * against the system mbuf pool limit, you can tune nmbclusters * to adjust for this. 
*/ - // Linux Defaults + +#ifdef MAX_DESC + static const constexpr uint32_t NTXDESCS = 8192; + static const constexpr uint32_t NRXDESCS = 8192; +#else static const constexpr uint32_t NTXDESCS = 512; - static const constexpr uint32_t NRXDESCS = 512; + static const constexpr uint32_t NRXDESCS = 512; +#endif + + // Linux Defaults static const constexpr uint32_t RXBUFSZ = 2048; static const constexpr uint32_t BSIZEHEADER = 256; - - //static const constexpr uint32_t NTXDESCS = 8192; - //static const constexpr uint32_t NRXDESCS = 8192; + //static const constexpr uint32_t RXBUFSZ = 4096; //static const constexpr uint32_t RXBUFSZ = 16384; diff --git a/src/native/NetIcmp.cc b/src/native/NetIcmp.cc index e52076ce..3012637d 100644 --- a/src/native/NetIcmp.cc +++ b/src/native/NetIcmp.cc @@ -49,7 +49,7 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( PacketInfo pinfo; pinfo.flags = 0; // hijacking ping to dump ixgbe statistics - pinfo.get_stats = true; + pinfo.get_stats = false; #ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ // hardware ip checksum offload pinfo.flags |= PacketInfo::kNeedsIpCsum; From 5b98f2940eac8049b1c3c436a8514d6bff724b27 Mon Sep 17 00:00:00 2001 From: Han Date: Thu, 5 Dec 2019 22:31:15 -0500 Subject: [PATCH 12/20] Finally working on multiple core (i think) for memcached. Major fix was checksum offloading for packets less than MTU, which doesn't utilize the TSO offload feature, is still the original standard TCP pseudo checksum of src, dest ip addresses, TCP protocol, and length (tcp header+payload). However, for packets larger than the MTU, which requires enabling TSO, the TCP pseudo checksum is just src, dest ip address and TCP protocol, no length required. If this is not implemented correctly, you know it is a device problem as the packets won't even make it out onto wireshark. 
--- src/native/Cpu.cc | 2 +- src/native/Ixgbe.h | 6 +- src/native/IxgbeDriver.cc | 844 ++++++++++++++++++++++++++++++++------ src/native/IxgbeDriver.h | 22 +- src/native/Net.cc | 18 +- src/native/Net.h | 2 + src/native/NetChecksum.cc | 6 +- src/native/NetChecksum.h | 1 + src/native/NetIcmp.cc | 10 +- src/native/NetIp.cc | 2 +- src/native/NetTcp.cc | 169 +++++++- src/native/NetUdp.cc | 52 ++- 12 files changed, 987 insertions(+), 147 deletions(-) diff --git a/src/native/Cpu.cc b/src/native/Cpu.cc index cce44a3e..38ea019f 100644 --- a/src/native/Cpu.cc +++ b/src/native/Cpu.cc @@ -61,4 +61,4 @@ ebbrt::Cpu* ebbrt::Cpu::GetByApicId(size_t apic_id) { return &(*it); } -size_t ebbrt::Cpu::Count() { return 16; } //return cpus->size(); } +size_t ebbrt::Cpu::Count() { return cpus->size(); } diff --git a/src/native/Ixgbe.h b/src/native/Ixgbe.h index b2ed523b..f187386e 100644 --- a/src/native/Ixgbe.h +++ b/src/native/Ixgbe.h @@ -36,6 +36,10 @@ #define IXGBE_EITR_CNT_WDIS 0x80000000 +// max transmit sizes +#define IXGBE_MAX_TXD_PWR 14 +#define IXGBE_MAX_DATA_PER_TXD (1u << IXGBE_MAX_TXD_PWR) + enum l4_type { l4_type_udp = 0, l4_type_tcp, l4_type_sctp, l4_type_rsv }; #define ETHHDR_LEN 14 @@ -277,7 +281,7 @@ typedef union { uint64_t fcoe : 1; uint64_t rsvd_2 : 4; - uint64_t dytp : 4; + uint64_t dtyp : 4; uint64_t rsvd_3 : 5; uint64_t dext : 1; diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 69b927b0..961c985a 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -19,6 +19,23 @@ #include #include +void dumpPacketContents(uint8_t* p1, uint64_t len) { + uint64_t i, j; + + ebbrt::kprintf_force("dumpPacketContents() len=%u\n", len); + for (i = 0; i < len; i+=8) { + if (i+8 < len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } else { + for(j=i;jiplen = iplen; actx->maclen = maclen; - actx->dytp = 0b0010; + actx->dtyp = 0b0010; actx->dext = 1; actx->idx = idx; 
@@ -226,6 +243,346 @@ void ebbrt::IxgbeDriverRep::AddTx(uint64_t pa, uint64_t len, } void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { + uint64_t data, len, tsodata, tsolen; + std::unique_ptr b; + tdesc_advance_tx_rf_t* arfx; + tdesc_advance_ctxt_wb_t* actx; + uint32_t mcore = static_cast(Cpu::GetMine()); + //int i; + + // On TSO, the maximum PAYLEN can be up to 2^18 - 1 + len = buf->ComputeChainDataLength(); + if (len > 262144) { + ebbrt::kprintf_force("\t kabort Send() len=%u greater than TSO limit of 262144 bytes\n", len); + return; + } + + if(buf->IsChained()) { + b = MakeUniqueIOBuf(len); + auto mdata = b->MutData(); + for (auto& buf_it : *buf) { + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); + } + data = reinterpret_cast(b->MutData()); + } else { + data = reinterpret_cast(buf->Data()); + } + + // if no IP/TCP checksum + if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. + // Max length is 15.5 KB + arfx->dtalen = len; + + // In a single-send packet, PAYLEN defines the entire packet size fetched from host memory. 
+ arfx->paylen = len; + + // crc checksum + arfx->ifcs = 1; + + // rs bit should only be set when eop is set + arfx->eop = 1; + arfx->rs = 1; + + // type is advanced + arfx->dtyp = 0x3; + arfx->dext = 1; + + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } + else { + if(len > IXGBE_MAX_DATA_PER_TXD) { + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0x2 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + + + //first descriptor + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; + arfx->dtyp = 0x3; + arfx->ifcs = 1; + arfx->dext = 1; + arfx->tse = 1; + arfx->ixsm = 1; + arfx->txsm = 1; + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload length, so no header length + arfx->paylen = pinfo.tcp_len; + //ebbrt::kprintf("Send() first descriptor mcore=%u tail=%u dtalen=%u paylen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, IXGBE_MAX_DATA_PER_TXD, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + + tsodata = data; + tsolen = len; + + while(tsolen > IXGBE_MAX_DATA_PER_TXD) { + tsodata += IXGBE_MAX_DATA_PER_TXD; + tsolen -= IXGBE_MAX_DATA_PER_TXD; + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->dtyp = 0x3; + arfx->dext = 1; + arfx->tse = 1; + arfx->ifcs = 1; + arfx->address = tsodata; + + if(tsolen > IXGBE_MAX_DATA_PER_TXD) { + arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; + //ebbrt::kprintf("Send() middle descriptor(s) mcore=%u tail=%u dtalen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, IXGBE_MAX_DATA_PER_TXD, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } else { + // last descriptor + arfx->dtalen = tsolen; + arfx->eop = 1; + arfx->rs = 1; + + //ebbrt::kprintf("Send() last descriptor mcore=%u tail=%u dtalen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, tsolen, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } + } + + } + else if(len > 1490 && len < IXGBE_MAX_DATA_PER_TXD) { + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + 
actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = len; + arfx->dtyp = 0x3; + arfx->eop = 1; + arfx->rs = 1; + arfx->ifcs = 1; + arfx->dext = 1; + arfx->tse = 1; + + arfx->ixsm = 1; + arfx->txsm = 1; + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size + arfx->paylen = pinfo.tcp_len; + //ebbrt::kprintf("Send mcore=%u tail=%u dtalen=%u paylen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, len, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } else { + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux, ignored when no TSE + actx->mss = 0; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = 0; + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = len; + arfx->paylen = len; + + arfx->dtyp = 0x3; + arfx->eop = 1; + arfx->ifcs = 1; + + arfx->rs = 1; + + arfx->dext = 1; + arfx->tse = 0; + + arfx->ixsm = 1; + + // if need TCP checksum offload + //if (pinfo.flags & PacketInfo::kNeedsCsum) { + arfx->txsm = 1; + //} + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } + } + /*if(len > 1448) { + //dumpPacketContents(reinterpret_cast(data), len); + //ebbrt::kabort("kabort Send()\n"); + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + // l4type = tcp: 01 + actx->l4t = 1; + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux + actx->mss = 1448; + // TCP header length, with no tcp options == 20 + actx->l4len = pinfo.tcp_hdr_len; + + ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = len; + arfx->dtyp = 0x3; + arfx->eop = 1; + arfx->rs = 1; + arfx->ifcs = 1; + arfx->dext = 1; + arfx->tse = 1; + + arfx->ixsm = 1; + arfx->txsm = 1; + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size + arfx->paylen = pinfo.tcp_len; + + ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + //ebbrt::kabort("Exiting\n"); + + } else { + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. + // Max length is 15.5 KB + arfx->dtalen = len; + + // In a single-send packet, PAYLEN defines the entire packet size fetched from host memory. + arfx->paylen = len; + + // crc checksum + arfx->ifcs = 1; + + // rs bit should only be set when eop is set + arfx->eop = 1; + arfx->rs = 1; + + // type is advanced + arfx->dtyp = 0x3; + arfx->dext = 1; + + ebbrt::kprintf("Send mcore=%u tail=%u tcp_hdr_len=%u tcp_len=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, pinfo.tcp_hdr_len, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + }*/ + + + // Force memory writes to complete before letting h/w know there + // are new descriptors to fetch. (Only applicable for weak-ordered + // memory model archs, such as IA-64). 
+ asm volatile("sfence" ::: "memory"); + + WriteTdt_1(mcore, ixgmq_.tx_tail_); + + // keep looping until processed + while(arfx->dd == 0) { + // makes sure all reads are finished before checking again + asm volatile("lfence":::"memory"); + } + + //rtdh = ReadTdh_1(mcore); + //rtdt = ReadTdt_1(mcore); + //ebbrt::kprintf("\t Send() core=%u After len=%d rtdh=%u %rtdt=%u tail=%u\n\n", mcore, len, rtdh, rtdt, ixgmq_.tx_tail_); +} + +/*void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { bool ip_cksum = false; bool tcpudp_cksum = false; uint64_t data; @@ -251,12 +608,13 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { (std::abs(static_cast(ixgmq_.tx_tail_ - ixgmq_.tx_head_))); // not enough descriptors got freed if (free_desc < (count + 1)) { + ebbrt::kprintf("Not enough descriptors got freed\n"); return; } } #endif - if (pinfo.flags & PacketInfo::kNeedsIpCsum) { +if (pinfo.flags & PacketInfo::kNeedsIpCsum) { ip_cksum = true; } @@ -270,11 +628,12 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { count = buf->CountChainElements(); if(tcpudp_cksum) { - if (pinfo.csum_offset == 6) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); + if (pinfo.csum_offset == 6) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); } else if (pinfo.csum_offset == 16) { AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); } else { + ebbrt::kprintf("%s unknown packet type checksum\n", __FUNCTION__); ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); } } @@ -319,7 +678,6 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { } else { // buffers NOT chained data = reinterpret_cast(buf->Data()); len = buf->ComputeChainDataLength(); - if(tcpudp_cksum) { // check datasheet for numbers if (pinfo.csum_offset == 6) { @@ -336,7 +694,9 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { } // dump eth packet info - ebbrt::kprintf("\t Send() on core: %d 
len=%d\n", mcore, len); + auto rtdh = ReadTdh_1(mcore); + auto rtdt = ReadTdt_1(mcore); + ebbrt::kprintf("\t Send() Before len=%d rtdh=%u %rtdt=%u tx_tail_=%u\n", len, rtdh, rtdt, (uint32_t)(ixgmq_.tx_tail_)); auto p1 = reinterpret_cast(data); for (int i = 0; i < (int)len; i+=8) { if (i+8 < (int)len) { @@ -351,10 +711,100 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { // bump tx_tail // indicates position beyond last descriptor hw - size_t tail = ixgmq_.tx_tail_; + uint32_t tail = (uint32_t)(ixgmq_.tx_tail_); + asm volatile("sfence" ::: "memory"); + WriteTdt_1(mcore, tail); -} + tdesc_advance_tx_rf_t* actx; + actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail-1])); + + while(actx->dd == 0) { + ebbrt::clock::SleepMilli(1); + } + auto rtdh = ReadTdh_1(mcore); + auto rtdt = ReadTdt_1(mcore); + ebbrt::kprintf("\t Send() After len=%d rtdh=%u %rtdt=%u tail=%u\n\n", len, rtdh, rtdt, tail); + }*/ +/*void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { + bool ip_cksum = false; + bool tcpudp_cksum = false; + uint64_t data; + size_t len; + uint32_t mcore = static_cast(Cpu::GetMine()); + std::unique_ptr b; + + if (pinfo.flags & PacketInfo::kNeedsIpCsum) { + ip_cksum = true; + } + + if (pinfo.flags & PacketInfo::kNeedsCsum) { + tcpudp_cksum = true; + } + + len = buf->ComputeChainDataLength(); + + // buffers are chained + if(buf->IsChained()) { + b = MakeUniqueIOBuf(len); + auto mdata = b->MutData(); + for (auto& buf_it : *buf) { + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); + } + data = reinterpret_cast(b->MutData()); + } else { // buffers NOT chained + data = reinterpret_cast(buf->Data()); + } + + if(tcpudp_cksum) { + // check datasheet for numbers + if (pinfo.csum_offset == 6) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); + } else if (pinfo.csum_offset == 16) { + AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); + } else { + ebbrt::kabort("%s unknown packet type checksum\n", 
__FUNCTION__); + } + + AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); + } else { + AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); + } + + // dump eth packet info + auto rtdh = ReadTdh_1(mcore); + auto rtdt = ReadTdt_1(mcore); + ebbrt::kprintf("\t Core=%u Send() Before len=%d rtdh=%u %rtdt=%u tx_tail_=%u\n", mcore, len, rtdh, rtdt, (uint32_t)(ixgmq_.tx_tail_)); + auto p1 = reinterpret_cast(data); + for (int i = 0; i < (int)len; i+=8) { + if (i+8 < (int)len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X \n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } + else{ + ebbrt::kprintf("%02X \n", p1[i]); + } + } + ebbrt::kprintf("\n"); + + // bump tx_tail + // indicates position beyond last descriptor hw + uint32_t tail = (uint32_t)(ixgmq_.tx_tail_); + asm volatile("sfence" ::: "memory"); + + WriteTdt_1(mcore, tail); + + tdesc_advance_tx_rf_t* actx; + actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail-1])); + + while(actx->dd == 0) { + ebbrt::clock::SleepMilli(1); + } + rtdh = ReadTdh_1(mcore); + rtdt = ReadTdt_1(mcore); + ebbrt::kprintf("\t Send() After len=%d rtdh=%u %rtdt=%u tail=%u\n\n", len, rtdh, rtdt, tail); + } +*/ void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { // Disable RXCTRL - 8.2.3.8.10 @@ -372,9 +822,15 @@ void ebbrt::IxgbeDriver::WriteDmatxctl(uint32_t m) { } void ebbrt::IxgbeDriver::WriteDmatxctl_te(uint32_t m) { auto reg = bar0_.Read32(0x04A80); + ebbrt::kprintf("DMATXCTL= 0x%X\n", reg | m); bar0_.Write32(0x04A80, reg | m); } +//8.2.3.5.10 Extended Interrupt Auto Mask Enable registers — EIAM[n] (0x00AD0 + 4*(n-1), n=1...2; RW) +void ebbrt::IxgbeDriver::WriteEiam(uint32_t n, uint32_t m) { + bar0_.Write32(0x00AD0 + 4*n, m); +} + // 8.2.3.5.18 - General Purpose Interrupt Enable — GPIE (0x00898; RW) void ebbrt::IxgbeDriver::WriteGpie(uint32_t m) { auto reg = bar0_.Read32(0x00898); @@ -487,8 +943,9 @@ void 
ebbrt::IxgbeDriver::ReadCtrl() { // 8.2.3.1.3 Extended Device Control Register — CTRL_EXT (0x00018; RW) void ebbrt::IxgbeDriver::WriteCtrlExt(uint32_t m) { - auto reg = bar0_.Read32(0x00018); - bar0_.Write32(0x00018, reg | m); + //auto reg = bar0_.Read32(0x00018); + //bar0_.Write32(0x00018, reg | m); + bar0_.Write32(0x00018, m); } // 8.2.3.7.1 Filter Control Register — FCTRL (0x05080; RW) @@ -787,6 +1244,10 @@ void ebbrt::IxgbeDriver::WriteDcaTxctrlTxdescWbro(uint32_t n, uint32_t m) { auto reg = bar0_.Read32(0x0600C + 0x40 * n); bar0_.Write32(0x0600C + 0x40 * n, reg & m); } +void ebbrt::IxgbeDriver::ReadDcaTxctrl(uint32_t n) { + auto reg = bar0_.Read32(0x0600C + 0x40 * n); + ebbrt::kprintf("DCA_TXCTRL=0x%X\n", reg); +} // 8.2.3.11.1 Rx DCA Control Register — DCA_RXCTRL[n] (0x0100C + 0x40*n, // n=0...63 and 0x0D00C + 0x40*(n-64), @@ -795,8 +1256,11 @@ void ebbrt::IxgbeDriver::WriteDcaRxctrl_1(uint32_t n, uint32_t m) { auto reg = bar0_.Read32(0x0100C + 0x40 * n); bar0_.Write32(0x0100C + 0x40 * n, reg & m); } +void ebbrt::IxgbeDriver::ReadDcaRxctrl(uint32_t n) { + auto reg = bar0_.Read32(0x0100C + 0x40 * n); + ebbrt::kprintf("DCA_RXCTRL=0x%X\n", reg); +} -// void ebbrt::IxgbeDriver::WriteDcaRxctrl_1_RxdataWrro(uint32_t n, uint32_t m); void ebbrt::IxgbeDriver::WriteDcaRxctrl_2(uint32_t n, uint32_t m) { auto reg = bar0_.Read32(0x0D00C + 0x40 * n); bar0_.Write32(0x0D00C + 0x40 * n, reg & m); @@ -804,8 +1268,9 @@ void ebbrt::IxgbeDriver::WriteDcaRxctrl_2(uint32_t n, uint32_t m) { // 8.2.3.7.5 Receive Checksum Control — RXCSUM (0x05000; RW) void ebbrt::IxgbeDriver::WriteRxcsum(uint32_t m) { - auto reg = bar0_.Read32(0x05000); - bar0_.Write32(0x05000, reg | m); + //auto reg = bar0_.Read32(0x05000); + //bar0_.Write32(0x05000, reg | m); + bar0_.Write32(0x05000, m); } // 8.2.3.8.13 RSC Control — RSCCTL[n] (0x0102C + 0x40*n, n=0...63 @@ -818,19 +1283,22 @@ void ebbrt::IxgbeDriver::WriteRscctl(uint32_t n, uint32_t m) { // 8.2.3.7.4 Packet Split Receive Type Register — PSRTYPE[n] 
// (0x0EA00 + 4*n, n=0...63 / 0x05480 + 4*n, n=0...15; RW) void ebbrt::IxgbeDriver::WritePsrtype(uint32_t n, uint32_t m) { - auto reg = bar0_.Read32(0x0EA00 + 0x40 * n); + //auto reg = bar0_.Read32(0x0EA00 + 0x40 * n); + auto reg = bar0_.Read32(0x05480 + 0x40 * n); bar0_.Write32(0x0EA00 + 0x40 * n, reg | m); } void ebbrt::IxgbeDriver::WritePsrtypeZero(uint32_t n) { - bar0_.Write32(0x0EA00 + 0x40 * n, 0x0); + //bar0_.Write32(0x0EA00 + 0x40 * n, 0x0); + bar0_.Write32(0x05480, n); } // 8.2.3.7.15 Redirection Table — RETA[n] (0x0EB00 + 4*n, n=0...31/ 0x05C00 + // 4*n, n=0...31; RW) void ebbrt::IxgbeDriver::WriteReta(uint32_t n, uint32_t m) { - bar0_.Write32(0x0EB00 + 4 * n, m); - ebbrt::kprintf("WriteReta(n=%d) = 0x%08X\n", n, m); + //bar0_.Write32(0x0EB00 + 4 * n, m); + bar0_.Write32(0x05C00 + 4 * n, m); + ebbrt::kprintf("WriteReta(n=%d) %X = 0x%08X\n", n, 0x05C00 + 4 * n, m); } // 8.2.3.7.6 Receive Filter Control Register — RFCTL (0x05008; RW) @@ -845,8 +1313,9 @@ void ebbrt::IxgbeDriver::WriteTxpbthresh(uint32_t n, uint32_t m) { // 8.2.3.7.12 Multiple Receive Queues Command Register- MRQC (0x0EC80 / 0x05818; // RW) void ebbrt::IxgbeDriver::WriteMrqc(uint32_t m) { - auto reg = bar0_.Read32(0x0EC80); - bar0_.Write32(0x0EC80, reg | m); + //auto reg = bar0_.Read32(0x0EC80); + //bar0_.Write32(0x0EC80, reg | m); + bar0_.Write32(0x05818, m); } // 8.2.3.9.15 Multiple Transmit Queues Command Register — MTQC (0x08120; RW) @@ -1006,6 +1475,9 @@ uint16_t ebbrt::IxgbeDriver::ReadTdh(uint32_t n) { auto reg = bar0_.Read32(0x06010 + 0x40 * n); return reg & 0xFFFF; } +uint32_t ebbrt::IxgbeDriver::ReadTdt(uint32_t n) { + return bar0_.Read32(0x06018 + 0x40 * n) & 0xFFFF; +} // 8.2.3.9.11 Tx Descriptor Completion Write Back Address Low — // TDWBAL[n] (0x06038+0x40*n, n=0...127; RW) @@ -1036,8 +1508,8 @@ void ebbrt::IxgbeDriver::WriteRdlen_2(uint32_t n, uint32_t m) { // n=0...63 and 0x0D014 + 0x40*(n-64), n=64...127 / 0x02100 + 4*n, [n=0...15]; // RW) void 
ebbrt::IxgbeDriver::WriteSrrctl_1(uint32_t n, uint32_t m) { - auto reg = bar0_.Read32(0x01014 + 0x40 * n); - bar0_.Write32(0x01014 + 0x40 * n, reg | m); + //auto reg = bar0_.Read32(0x01014 + 0x40 * n); + bar0_.Write32(0x01014 + 0x40 * n, m); } void ebbrt::IxgbeDriver::WriteSrrctlZero(uint32_t n) { bar0_.Write32(0x01014 + 0x40 * n, 0x0); @@ -1045,8 +1517,8 @@ void ebbrt::IxgbeDriver::WriteSrrctlZero(uint32_t n) { // 8.2.3.8.12 RSC Data Buffer Control Register — RSCDBU (0x03028; RW) void ebbrt::IxgbeDriver::WriteRscdbu(uint32_t m) { - auto reg = bar0_.Read32(0x03028); - bar0_.Write32(0x03028, reg | m); + //auto reg = bar0_.Read32(0x03028); + bar0_.Write32(0x03028, m); } void ebbrt::IxgbeDriver::WriteSrrctl_1_desctype(uint32_t n, uint32_t m) { @@ -1090,8 +1562,8 @@ void ebbrt::IxgbeDriver::WriteTxpbThresh(uint32_t n, uint32_t m) { // 8.2.3.22.8 MAC Core Control 0 Register — HLREG0 (0x04240; RW) void ebbrt::IxgbeDriver::WriteHlreg0(uint32_t m) { - auto reg = bar0_.Read32(0x04240); - bar0_.Write32(0x04240, reg | m); + //auto reg = bar0_.Read32(0x04240); + bar0_.Write32(0x04240, m); } // 8.2.3.8.5 Receive Descriptor Tail — RDT[n] (0x01018 + 0x40*n, n=0...63 and @@ -1133,10 +1605,12 @@ void ebbrt::IxgbeDriver::SwfwSemRelease() { void ebbrt::IxgbeDriver::WriteIvarAlloc0(uint32_t n, uint32_t m) { auto reg = bar0_.Read32(0x00900 + 4 * n); bar0_.Write32(0x00900 + 4 * n, reg | m); + ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); } void ebbrt::IxgbeDriver::WriteIvarAllocval0(uint32_t n, uint32_t m) { auto reg = bar0_.Read32(0x00900 + 4 * n); bar0_.Write32(0x00900 + 4 * n, reg | m); + ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); } void ebbrt::IxgbeDriver::WriteIvarAlloc1(uint32_t n, uint32_t m) { @@ -1151,10 +1625,12 @@ void ebbrt::IxgbeDriver::WriteIvarAllocval1(uint32_t n, uint32_t m) { void ebbrt::IxgbeDriver::WriteIvarAlloc2(uint32_t n, uint32_t m) { auto reg = bar0_.Read32(0x00900 + 4 * n); bar0_.Write32(0x00900 + 4 * n, reg | m); + 
ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); } void ebbrt::IxgbeDriver::WriteIvarAllocval2(uint32_t n, uint32_t m) { auto reg = bar0_.Read32(0x00900 + 4 * n); bar0_.Write32(0x00900 + 4 * n, reg | m); + ebbrt::kprintf("IVAR: 0x%X 0x%X\n", 0x00900 + 4 * n, reg | m); } void ebbrt::IxgbeDriver::WriteIvarAlloc3(uint32_t n, uint32_t m) { @@ -1169,8 +1645,8 @@ void ebbrt::IxgbeDriver::WriteIvarAllocval3(uint32_t n, uint32_t m) { // 8.2.3.10.2 DCB Transmit Descriptor Plane Control and Status — RTTDCS // (0x04900; RW) DMA-Tx void ebbrt::IxgbeDriver::WriteRttdcs(uint32_t m) { - auto reg = bar0_.Read32(0x04900); - bar0_.Write32(0x04900, reg | m); + //auto reg = bar0_.Read32(0x04900); + bar0_.Write32(0x04900, m); } void ebbrt::IxgbeDriver::WriteRttdcsArbdisEn(uint32_t m) { auto reg = bar0_.Read32(0x04900); @@ -1182,12 +1658,13 @@ void ebbrt::IxgbeDriver::WriteRttpcs(uint32_t m) { bar0_.Write32(0x0CD00, m); } // 8.2.3.12.5 Security Rx Control — SECRXCTRL (0x08D00; RW) void ebbrt::IxgbeDriver::WriteSecrxctrl_Rx_Dis(uint32_t m) { - auto reg = bar0_.Read32(0x08D00); + bar0_.Write32(0x08D00, m); + /*auto reg = bar0_.Read32(0x08D00); if (m) { bar0_.Write32(0x08D00, reg | m); } else { bar0_.Write32(0x08D00, reg & ~(0x1 << 1)); - } + }*/ } // 8.2.3.12.6 Security Rx Status — SECRXSTAT (0x08D04; RO) @@ -1290,7 +1767,8 @@ void ebbrt::IxgbeDriver::StopDevice() { WriteDmatxctl(0xFFFFFFFE); // disable interrupts - WriteEimc(0x7FFFFFFF); + //WriteEimc(0x7FFFFFFF); + WriteEimc(0xFFFFFFFF); ReadEicr(); // disable each rx and tx queue @@ -1356,15 +1834,20 @@ void ebbrt::IxgbeDriver::Init() { ebbrt::clock::SleepMilli(250); // disable interrupts - WriteEimc(0x7FFFFFFF); + //WriteEimc(0x7FFFFFFF); + WriteEimc(0xFFFFFFFF); ReadEicr(); // Let firmware know we have taken over - WriteCtrlExt(0x1 << 28); // DRV_LOAD + //WriteCtrlExt(0x1 << 28); // DRV_LOAD + WriteCtrlExt(0x10010000); // DRV_LOAD and NS_DIS + //If legacy descriptors are used, this bit should be set to 1b. 
// No snoop disable from FreeBSD ?? - WriteCtrlExt(0x1 << 16); // NS_DIS - +//#ifndef RSC_EN +// WriteCtrlExt(0x1 << 16); // NS_DIS +//#endif + // Initialize flow-control registers for (auto i = 0; i < 8; i++) { if (i < 4) { @@ -1410,16 +1893,23 @@ void ebbrt::IxgbeDriver::Init() { // clears on read WriteEicr(0xFFFFFFFF); + /* + * use EIAM to auto-mask when MSI-X interrupt is asserted + * this saves a register write for every interrupt + */ + //WriteEiam(0, 0xFFFFFFFF); + //WriteEiam(1, 0xFFFFFFFF); + /* setup msix */ // switch to msix mode - WriteGpie(0x1 << 4); // Multiple_MSIX - WriteGpie(0x1 << 31); // PBA_support + WriteGpie(0x1 << 4); // Multiple_MSIX WriteGpie(0x1 << 5); // OCD - - // TODO: Set up management interrupt handler - + WriteGpie(0x1 << 31); // PBA_support // Enable auto masking of interrupt WriteGpie(0x1 << 30); // EIAME + + // TODO: Set up management interrupt handler + #ifdef RSC_EN // TODO: RSC delay value, just a guess at (1 + 1) * 4us = 8 us @@ -1484,25 +1974,29 @@ void ebbrt::IxgbeDriver::Init() { // TODO VLNCTRL WriteMcstctrl(0x0); -#ifndef RSC_EN - WriteRxcsum(0x1 << 12); // IP payload checksum enable -#else +//#ifndef RSC_EN + //WriteRxcsum(0x1 << 12); // IP payload checksum enable + //WriteRxcsum(0x3 << 12); // IP payload checksum enable + WriteRxcsum(0x3000); +//#else // note: PCSD: The Fragment Checksum and IP Identification fields are mutually exclusive with // the RSS hash. Only one of the two options is reported in the Rx descriptor. - WriteRxcsum(0x2000); -#endif +// WriteRxcsum(0x2000); +//#endif // TODO RQTC #ifdef RSC_EN - WriteRfctl(0x0); + WriteRfctl(0xC0); #else - WriteRfctl(0x1 << 5); + WriteRfctl(0xE0); #endif for (auto i = 0; i < 256; i++) { WriteMpsar(i, 0x0); } + // !! Support for RSS is not provided when legacy receive descriptor format is used. 
+ // RSSRK - random seeds taken from Linux WriteRssrk(0, 0xA38DD80F); WriteRssrk(1, 0xD107C3DC); @@ -1518,14 +2012,27 @@ void ebbrt::IxgbeDriver::Init() { // Fill in RSS redirection table (128 entries), sets which core the lowest 7 bits of hashed output goes to // hacky atm for (auto i = 0; i < 32; i += 4) { + if(ncore > 0) { + //WriteReta(i, 0x0000000); + //WriteReta(i+1, 0x0000000); + //WriteReta(i+2, 0x0000000); + //WriteReta(i+3, 0x0000000); + WriteReta(i, 0x03020100); + WriteReta(i+1, 0x07060504); + WriteReta(i+2, 0x0B0A0908); + WriteReta(i+3, 0x0F0E0D0C); + } + /*if(ncore > 0) { + WriteReta(i, 0x01010101); + }*/ // all route to core 0 - if(ncore == 1) { + /*if(ncore == 1) { WriteReta(i, 0x0000000); WriteReta(i+1, 0x0000000); WriteReta(i+2, 0x0000000); WriteReta(i+3, 0x0000000); } else if(ncore == 2) { - WriteReta(i, 0x1010100); + WriteReta(i, 0x1010100); WriteReta(i+1, 0x1010100); WriteReta(i+2, 0x1010100); WriteReta(i+3, 0x1010100); @@ -1540,13 +2047,13 @@ void ebbrt::IxgbeDriver::Init() { WriteReta(i+2, 0x3020100); WriteReta(i+3, 0x7060504); } else { - WriteReta(i, 0x3020100); //8 + WriteReta(i, 0x3020100); //8 WriteReta(i+1, 0x7060504); //8 WriteReta(i+2, 0xB0A0908); // 8 //WriteReta(i+3, 0x3020100); //WriteReta(i+2, 0xB0A0908); //WriteReta(i+3, 0xF0E0D0C); - } + }*/ } //temp @@ -1597,11 +2104,15 @@ void ebbrt::IxgbeDriver::Init() { // Make sure RX CRC strip enabled in HLREG0 and RDRXCTL WriteRdrxctlRSCFRSTSIZE(~(0x1F << 17)); // s/w set to 0 WriteRdrxctl(0x1 << 1); // CRCStrip - WriteHlreg0(0x1 << 1); // CRCStrip + //WriteHlreg0(0x1 << 1); // CRCStrip + WriteHlreg0(0x2FFF); // CRCStrip WriteRdrxctl(0x1 << 25); // RSCACKC s/w set to 1 WriteRdrxctl(0x1 << 26); // FCOE_WRFIX s/w set to 1 - // TODO RSCDBU + // Disable RSC for ACK Packets, disables the coalescing of TCP packets without TCP payload. + // This bit should be set if performance problems are found. + WriteRscdbu(0xa0); + /***** END RX FILTER *****/ // Configure buffers etc. 
according to specification @@ -1612,7 +2123,8 @@ void ebbrt::IxgbeDriver::Init() { * to the DCB and virtualization modes (see Section 4.6.11.3). * Clear RTTDCS.ARBDIS to 0b. */ - WriteRttdcs(0x1 << 6); + //WriteRttdcs(0x1 << 6); + WriteRttdcs(0xC00040); WriteDtxmxszrq(MAX_BYTES_NUM_REQ); WriteTxpbsize(0, 0xA0 << 10); WriteTxpbThresh(0, 0xA0); @@ -1622,7 +2134,8 @@ void ebbrt::IxgbeDriver::Init() { } WriteMtqc(0x0); WriteMngtxmap(0x0); - WriteRttdcsArbdisEn(~(0x1 << 6)); + WriteRttdcs(0xC00000); + //WriteRttdcsArbdisEn(~(0x1 << 6)); /* Receive Init: Program RXPBSIZE, MRQC, PFQDE, RTRUP2TC, MFLCN.RPFCE, * and MFLCN.RFCE according to the DCB and virtualization modes @@ -1642,7 +2155,7 @@ void ebbrt::IxgbeDriver::Init() { // TODO Enable Jumbo Packets // disable relaxed ordering - for (auto i = 0; i < 128; i++) { + /*for (auto i = 0; i < 128; i++) { WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); // Txdesc_Wbro if (i < 64) { @@ -1656,7 +2169,7 @@ void ebbrt::IxgbeDriver::Init() { WriteDcaRxctrl_2( i - 64, ~(0x1 << 13)); // Rx data Write Relax Order Enable, bit 13 } - } + }*/ #ifdef DCA_ENABLE // DCA_MODE = DCA 1.0 @@ -1666,10 +2179,11 @@ void ebbrt::IxgbeDriver::Init() { void ebbrt::IxgbeDriver::FinishSetup() { // No snoop disable from FreeBSD ?? - WriteCtrlExt(0x1 << 16); // NS_DIS - for (size_t i = 0; i < Cpu::Count(); i++) { + WriteCtrlExt(0x10000); + //WriteCtrlExt(0x1 << 16); // NS_DIS + /*for (size_t i = 0; i < Cpu::Count(); i++) { WriteDcaRxctrlClear(i, ~(0x1 << 12)); // clear bit 12 - } + }*/ WriteEims(0xFFFF); } @@ -1694,30 +2208,38 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteRdlen_1(i, ixgmq[i]->rx_size_bytes_); // program srrctl register - WriteSrrctlZero(i); + WriteSrrctl_1(i, 0x2000403); + /*WriteSrrctlZero(i); WriteSrrctl_1(i, RXBUFSZ / 1024); // bsizepacket WriteSrrctl_1(i, (128 / 64) << 8); // bsizeheader // TODO headsplit adv #ifdef RSC_EN WriteSrrctl_1(i, 0x1 << 25); // desctype adv +#endif #else // legacy is default?? 
WriteSrrctl_1(i, ~(0x7 << 25)); // desctype legacy -#endif + #endif - WriteSrrctl_1(i, 0x1 << 28); // Drop_En + WriteSrrctl_1(i, 0x1 << 28); // Drop_En*/ #ifdef RSC_EN // RSC set up WriteRscctl(i, 0x3 << 2); // MAXDESC WriteRscctl(i, 0x1); // RSCEN - WritePsrtypeZero(i); - WritePsrtype(i, 0x1 << 4); // Split received TCP packets after TCP header. #endif // In NON-IOV, only psrtype[0] is used - WritePsrtype(0, 0x40001330); + if (i == 0) { + WritePsrtypeZero(0x1330); + } + //WritePsrtypeZero(i); + //WritePsrtype(i, 0x1 << 4); // Split received TCP packets after TCP header. + + + //WritePsrtype(0, 0x40001330); + // Set head and tail pointers WriteRdt_1(i, 0x0); WriteRdh_1(i, 0x0); @@ -1735,14 +2257,17 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // don't set up interrupts for tx since we have head writeback?? auto qn = i / 2; // put into correct IVAR - if ((i % 2) == 0) { // check if 2xN or 2xN + 1 - WriteIvarAlloc0(qn, i); // rx interrupt allocation corresponds to index i * + WriteIvarAlloc0(qn, i | 0x1 << 7); // rx interrupt allocation corresponds to index i * // 2 in MSI-X table - WriteIvarAllocval0(qn, 0x1 << 7); + //WriteIvarAllocval0(qn, 0x1 << 7); + WriteIvarAlloc0(qn, i << 8 | 0x1 << 15); + //WriteIvarAllocval0(qn, 0x1 << 15); } else { - WriteIvarAlloc2(qn, i << 16); - WriteIvarAllocval2(qn, 0x1 << 23); + WriteIvarAlloc2(qn, i << 16 | 0x1 << 23); + //WriteIvarAllocval2(qn, 0x1 << 23); + WriteIvarAlloc2(qn, i << 24 | 0x1 << 31); + //WriteIvarAllocval2(qn, 0x1 << 31); } // must be greater than rsc delay @@ -1766,35 +2291,43 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // Enable RX // disable RX_DIS - WriteSecrxctrl_Rx_Dis(0x1 << 1); + //WriteSecrxctrl_Rx_Dis(0x1 << 1); + WriteSecrxctrl_Rx_Dis(0x3); // TODO Timeout while (ReadSecrxstat_Sr_Rdy() == 0) ; WriteRxctrl(0x1); - // enable RX_DIS - WriteSecrxctrl_Rx_Dis(0x0 << 1); + // enable RX_DIS, disable aes encryption offload, power savings + WriteSecrxctrl_Rx_Dis(0x1); // add buffer to 
each descriptor - for (size_t j = 0; j < NRXDESCS - 1; j++) { + for (size_t j = 0; j < NRXDESCS-1; j++) { auto rxphys = reinterpret_cast((ixgmq[i]->circ_buffer_[j])->MutData()); auto tail = ixgmq[i]->rx_tail_; // update buffer address for descriptor -#ifdef RSC_EN +/*#ifdef RSC_EN rdesc_adv_rf_t* tmp; tmp = reinterpret_cast(&(ixgmq[i]->rx_ring_[tail])); tmp->packet_buffer = rxphys; // TODO only use this if enabling header splitting? tmp->header_buffer = 0; -#else +#else*/ ixgmq[i]->rx_ring_[tail].buffer_address = rxphys; -#endif +//#endif + /*if(i == 0) { + ebbrt::kprintf("rx_ring_[tail=%u].buffer_address = 0x%X\n", tail, rxphys); + }*/ ixgmq[i]->rx_tail_ = (tail + 1) % ixgmq[i]->rx_size_; } + auto rxphys = + reinterpret_cast((ixgmq[i]->circ_buffer_[NRXDESCS-1])->MutData()); + ixgmq[i]->rx_ring_[ixgmq[i]->rx_tail_].buffer_address = rxphys; + // bump tail pts via register rdt to enable descriptor fetching by setting to // length of ring minus one WriteRdt_1(i, ixgmq[i]->rx_tail_); @@ -1810,6 +2343,9 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteDcaTxctrl(i, 0x1 << 5); // DCA Enable WriteDcaTxctrl(i, myapic << 24); // CPUID = apic id +//#else +// ReadDcaTxctrl(i); +// ReadDcaRxctrl(i); #endif // program base address registers @@ -1824,19 +2360,30 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteTdwbah(i, (ixgmq[i]->txhwbaddr_ >> 32) & 0xFFFFFFFF); #endif - // enable transmit path - WriteDmatxctl_te(0x1); + // enable transmit path: This step should be executed oly for the first enabled transmit queue and does + // not need to be repeated for any following queues. 
+ if(i == 0) { + WriteDmatxctl_te(0x1); + } + WriteTdh(i, 0x0); + // transmit queue enable - WriteTxdctl(i, 0x1 << 25); - + //WriteTxdctl(i, 0x1 << 25); + //WriteTxdctl(i, 0x2010120); + WriteTxdctl(i, 0x2000000); + // poll until set, TODO: Timeout - while (ReadTxdctl_enable(i) == 0) - ; + while (ReadTxdctl_enable(i) == 0) { + ebbrt::clock::SleepMilli(1); + } + WriteTdt(i, 0x0); + ixgmq[i]->tx_tail_=0; + // TODO: set up dca txctrl FreeBSD? // clear TXdescWBROen - WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); + //WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); } // after packet received, need to make sure device can reuse @@ -1862,8 +2409,8 @@ void ebbrt::IxgbeDriverRep::ReclaimRx() { // may wait for RSC to be done uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, bool* process_rsc, - uint32_t* rnt) { -#ifdef RSC_EN + uint32_t* rnt, uint32_t* rxhead) { +//#ifdef RSC_EN rdesc_adv_wb_t* tmp; tmp = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); @@ -1874,8 +2421,13 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, auto rsccnt = tmp->rsccnt; + // makes sure all reads are finished before + asm volatile("lfence":::"memory"); + + //ebbrt::kprintf("rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", *rxhead, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); + // not RSC, handled normally - if (rsccnt == 0 && tmp->eop) { + if (rsccnt == 0 && tmp->eop && tmp->dd) { *len = tmp->pkt_len; /* set rx flags */ @@ -1895,9 +2447,14 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, } } + *rxhead = ixgmq_.rx_head_; + //ebbrt::kprintf("\t rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", *rxhead, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); + + //ebbrt::kprintf("\t rx_head=%d rsccnt=%d len=%d rss_type=0x%X rss_hash=0x%X\n", *rxhead, rsccnt, tmp->pkt_len, tmp->rss_type, tmp->rss_hash); + // reset descriptor - ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 
0; - ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + //ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; + //ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; // bump head ptr ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; @@ -1945,7 +2502,7 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, ixgmq_.rsc_used = true; ixgmq_.rsc_chain_.clear(); ixgmq_.rsc_chain_.emplace_back( - std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); + std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); // bump head ptr ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; @@ -1999,7 +2556,7 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, ebbrt::kabort("%s Not sure what state\n", __FUNCTION__); } -#else +/*#else // no RSC so just get one packet at a time rdesc_legacy_t tmp; tmp = ixgmq_.rx_ring_[ixgmq_.rx_head_]; @@ -2007,7 +2564,7 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, if (tmp.dd && tmp.eop) { *len = tmp.length; - /* set rx flags */ + // set rx flags // TCP/UDP checksum if (tmp.l4cs) { *rxflag |= RXFLAG_L4CS; @@ -2035,7 +2592,7 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, #endif return 0; } -#endif + #endif*/ return 1; } @@ -2047,6 +2604,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { bool process_rsc; uint32_t count; uint32_t rnt; + uint32_t rxhead; process_rsc = false; #ifdef STATS_EN ixgmq_.stat_num_itr ++; @@ -2057,14 +2615,14 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { uint32_t mcore = static_cast(Cpu::GetMine()); // get address of buffer with data - while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt) == 0) { + while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 0) { // hit last rsc context, start to process all buffers if (process_rsc) { process_rsc = false; count++; auto n = ixgmq_.rsc_chain_[0].first; - auto rsclen = 0; + uint32_t rsclen = 0; // TODO hack - need to set actual length of data else there'll be 
0's // attached @@ -2087,11 +2645,59 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ReclaimRx(); - ebbrt::kprintf("\t ReceivePoll() RSC on core: %d len=%d\n", mcore, static_cast(rsclen)); - + /*if (len > 60) { + ebbrt::kprintf("\t RSC on core: %d len=%u\n", mcore, rsclen); + }*/ root_.itf_.Receive(std::move(b), rxflag); } else { - // done with buffer addr above, now to reuse it + //count ++; + + //ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d\n", mcore, len, rxhead); + + ixgmq_.circ_buffer_[rxhead]->SetLength(len); + auto b = std::move(ixgmq_.circ_buffer_[rxhead]); + + // bump tail ptr + ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + + /*if (len > 60) { + ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d START\n", mcore, len, rxhead); + auto p1 = reinterpret_cast(b->MutData()); + int i=0; + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d END\n", mcore, len, rxhead); + for (int i = 0; i < (int)len; i+=8) { + if (i+8 < (int)len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } else { + for(int j = i; j < (int)len; j++) { + ebbrt::kprintf("%02X\n", p1[j]); + } + } + } + }*/ + /*if (len > 60) { + ebbrt::kprintf("\t ReceivePoll on core: %d len=%u\n", mcore, len); + }*/ + root_.itf_.Receive(std::move(b), rxflag); + + // reset buffer + ixgmq_.rx_ring_[rxhead].raw[0] = 0; + ixgmq_.rx_ring_[rxhead].raw[1] = 0; + // allocate new rx buffer + ixgmq_.circ_buffer_[rxhead] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[rxhead])->MutData()); + // update buffer with new adder + ixgmq_.rx_ring_[rxhead].buffer_address = rxphys; + + // TODO: Update tail register here or above? 
+ //if (count > 0) { + // update reg + WriteRdt_1(mcore, ixgmq_.rx_tail_); + //} + + /*// done with buffer addr above, now to reuse it auto tail = ixgmq_.rx_tail_; // bump tail ptr @@ -2117,25 +2723,23 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.rx_ring_[tail].buffer_address = rxphys; // dump eth packet info + //if(len > 1500 && len < 1600) { ebbrt::kprintf("\t ReceivePoll() on core: %d len=%d\n", mcore, len); - auto p1 = reinterpret_cast(b->MutData()); - for (int i = 0; i < (int)len; i+=8) { - if (i+8 < (int)len) { - ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - } - else{ - ebbrt::kprintf("%02X\n", p1[i]); - } - } + + auto p1 = reinterpret_cast(b->MutData()); + for (int i = 0; i < (int)len; i+=8) { + if (i+8 < (int)len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } + else{ + ebbrt::kprintf("%02X\n", p1[i]); + } + } + //} + root_.itf_.Receive(std::move(b), rxflag); - } - } - } - - // TODO: Update tail register here or above? 
- if (count > 0) { - // update reg - WriteRdt_1(mcore, ixgmq_.rx_tail_); + }*/ + } } } @@ -2143,7 +2747,7 @@ ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) : root_(root), ixgq_(root_.GetQueue()), ixgmq_(root.GetMultiQueue(Cpu::GetMine())), receive_callback_([this]() { ReceivePoll(); }) { - this->ReceivePoll(); + //this->ReceivePoll(); } uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { @@ -2180,3 +2784,11 @@ void ebbrt::IxgbeDriverRep::WriteEimc(uint32_t m) { root_.bar0_.Write32(0x00888, // 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) void ebbrt::IxgbeDriverRep::WriteEims(uint32_t m) { root_.bar0_.Write32(0x00880, m); } + +uint32_t ebbrt::IxgbeDriverRep::ReadTdh_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x06010 + 0x40 * n); + return reg & 0xFFFF; +} +uint32_t ebbrt::IxgbeDriverRep::ReadTdt_1(uint32_t n) { + return root_.bar0_.Read32(0x06018 + 0x40 * n) & 0xFFFF; +} diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 0bd6cfd1..468b9f79 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -101,8 +101,10 @@ class IxgbeDriver : public EthernetDevice { static const constexpr uint32_t NTXDESCS = 8192; static const constexpr uint32_t NRXDESCS = 8192; #else - static const constexpr uint32_t NTXDESCS = 512; - static const constexpr uint32_t NRXDESCS = 512; + //static const constexpr uint32_t NTXDESCS = 512; + //static const constexpr uint32_t NRXDESCS = 512; + static const constexpr uint32_t NTXDESCS = 128; + static const constexpr uint32_t NRXDESCS = 128; #endif // Linux Defaults @@ -128,8 +130,8 @@ class IxgbeDriver : public EthernetDevice { tx_last_tail_(0), tx_size_(NTXDESCS), idx_(idx), rxflag_(0), rsc_used(false), hanc{0} { - circ_buffer_.reserve(NRXDESCS); - for (uint32_t k = 0; k < NRXDESCS; k++) { + circ_buffer_.reserve(NRXDESCS+1); + for (uint32_t k = 0; k < NRXDESCS+1; k++) { circ_buffer_.emplace_back(MakeUniqueIOBuf(RXBUFSZ, true)); } @@ -138,7 +140,7 @@ class 
IxgbeDriver : public EthernetDevice { // packet data else code will read redundant // zeros if packet len does not use full buffer // TODO: should be optimized - rsc_chain_.reserve(NRXDESCS); + rsc_chain_.reserve(NRXDESCS+1); // RX ring buffer allocation auto sz = align::Up(sizeof(rdesc_legacy_t) * NRXDESCS, 4096); @@ -297,6 +299,7 @@ class IxgbeDriver : public EthernetDevice { void WriteEicr(uint32_t m); void WriteGpie(uint32_t m); + void WriteEiam(uint32_t n, uint32_t m); void WriteEims(uint32_t m); @@ -330,7 +333,9 @@ class IxgbeDriver : public EthernetDevice { void WriteDcaRxctrl_1(uint32_t n, uint32_t m); void WriteDcaRxctrl_2(uint32_t n, uint32_t m); void WriteDcaCtrl(uint32_t m); - + void ReadDcaTxctrl(uint32_t n); + void ReadDcaRxctrl(uint32_t n); + void WriteRdbal_1(uint32_t n, uint32_t m); void WriteRdbal_2(uint32_t n, uint32_t m); @@ -366,6 +371,7 @@ class IxgbeDriver : public EthernetDevice { void WriteTdh(uint32_t n, uint32_t m); void WriteTdt(uint32_t n, uint32_t m); + uint32_t ReadTdt(uint32_t n); void WriteTdwbal(uint32_t n, uint32_t m); void WriteTdwbah(uint32_t n, uint32_t m); @@ -499,8 +505,10 @@ class IxgbeDriverRep : public MulticoreEbb { void WriteEimcn(uint32_t n, uint32_t m); void WriteEimc(uint32_t m); void WriteEims(uint32_t m); + uint32_t ReadTdh_1(uint32_t n); + uint32_t ReadTdt_1(uint32_t n); uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, - bool* process_rsc, uint32_t* rnt); + bool* process_rsc, uint32_t* rnt, uint32_t* rxhead); const IxgbeDriver& root_; e10k_queue_t& ixgq_; diff --git a/src/native/Net.cc b/src/native/Net.cc index d884b7e0..f65bfe47 100644 --- a/src/native/Net.cc +++ b/src/native/Net.cc @@ -17,12 +17,13 @@ void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf, auto packet_len = buf->ComputeChainDataLength(); // Drop packets that are too small - if (packet_len <= sizeof(EthernetHeader)) + if (packet_len <= sizeof(EthernetHeader)) { + ebbrt::kprintf("packet_len=%d too small (less than 
EthernetHeader)\n", packet_len); return; + } auto dp = buf->GetMutDataPointer(); auto& eth_header = dp.Get(); - buf->Advance(sizeof(EthernetHeader)); switch (ntohs(eth_header.type)) { @@ -34,6 +35,19 @@ void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf, ReceiveArp(eth_header, std::move(buf)); break; } + default: { + ebbrt::kprintf("NetworkManager::Interface::Receive(): Unknown eth_header.type=0x%X packet_len=%u\n", ntohs(eth_header.type), packet_len); + ebbrt::kabort("NetworkManager::Interface::Receive()\n"); + /*for (int i = 0; i < (int)packet_len; i+=8) { + if (i+8 < (int)packet_len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } else { + for(int j = i; j < (int)packet_len; j++) { + ebbrt::kprintf("%02X\n", p1[j]); + } + } + }*/ + } } } diff --git a/src/native/Net.h b/src/native/Net.h index 33c501e9..65ad3595 100644 --- a/src/native/Net.h +++ b/src/native/Net.h @@ -46,6 +46,8 @@ struct PacketInfo { uint16_t gso_size{0}; uint16_t csum_start{0}; uint16_t csum_offset{0}; + uint32_t tcp_len{0}; + uint32_t tcp_hdr_len{0}; bool get_stats{false}; }; diff --git a/src/native/NetChecksum.cc b/src/native/NetChecksum.cc index bfd60c09..86cb4747 100644 --- a/src/native/NetChecksum.cc +++ b/src/native/NetChecksum.cc @@ -41,7 +41,7 @@ uint32_t Add32WithCarry(uint32_t a, uint32_t b) { } // Compute checksum over a contiguous region of memory -uint32_t Csum(const uint8_t* buf, size_t len, size_t offset = 0) { +uint64_t Csum(const uint8_t* buf, size_t len, size_t offset = 0) { if (unlikely(len == 0)) return 0; @@ -177,3 +177,7 @@ uint16_t ebbrt::IpCsum(const IOBuf& buf) { return CsumFold(IpCsumNoFold(buf)); } uint16_t ebbrt::IpCsum(const uint8_t* buf, size_t len) { return CsumFold(Csum(buf, len)); } + +uint32_t ebbrt::CsumTest(const IOBuf& buf) { + return IpCsumNoFold(buf); +} diff --git a/src/native/NetChecksum.h b/src/native/NetChecksum.h index 9274c8ee..55333e16 100644 --- 
a/src/native/NetChecksum.h +++ b/src/native/NetChecksum.h @@ -20,6 +20,7 @@ uint16_t IpPseudoCsum(const IOBuf& buf, uint8_t proto, Ipv4Address src, Ipv4Address dst); uint16_t IpCsum(const IOBuf& buf); uint16_t IpCsum(const uint8_t* buf, size_t len); +uint32_t CsumTest(const IOBuf& buf); } // namespace ebbrt #endif // BAREMETAL_SRC_INCLUDE_EBBRT_NETCHECKSUM_H_ diff --git a/src/native/NetIcmp.cc b/src/native/NetIcmp.cc index 3012637d..a03c616e 100644 --- a/src/native/NetIcmp.cc +++ b/src/native/NetIcmp.cc @@ -12,6 +12,7 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( EthernetHeader& eth_header, Ipv4Header& ip_header, std::unique_ptr buf) { auto packet_len = buf->ComputeChainDataLength(); + ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); if (unlikely(packet_len < sizeof(IcmpHeader))) return; @@ -19,6 +20,7 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( auto dp = buf->GetMutDataPointer(); auto& icmp_header = dp.Get(); + ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); #ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ // software checksum if (IpCsum(*buf)) @@ -50,12 +52,12 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( pinfo.flags = 0; // hijacking ping to dump ixgbe statistics pinfo.get_stats = false; -#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ +//#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ // hardware ip checksum offload - pinfo.flags |= PacketInfo::kNeedsIpCsum; -#else +// pinfo.flags |= PacketInfo::kNeedsIpCsum; +//#else ip_header.chksum = ip_header.ComputeChecksum(); -#endif +//#endif buf->Retreat(ip_header.HeaderLength()); diff --git a/src/native/NetIp.cc b/src/native/NetIp.cc index d320195b..8409e4ab 100644 --- a/src/native/NetIp.cc +++ b/src/native/NetIp.cc @@ -148,7 +148,7 @@ void ebbrt::NetworkManager::Interface::SendIp(std::unique_ptr buf, pinfo.csum_start += sizeof(Ipv4Header); pinfo.hdr_len += sizeof(Ipv4Header); - + EthArpSend(kEthTypeIp, ih, std::move(buf), pinfo); } diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index 
b9c304ea..11325f98 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ -950,7 +950,7 @@ void ebbrt::NetworkManager::TcpEntry::EnqueueSegment( th.SetHdrLenFlags(sizeof(TcpHeader) + optlen, flags); // ackno, wnd, and checksum are set in Output() th.urgp = 0; - + pending_segments.emplace_back(std::move(buf), th, tcp_len); snd_nxt += tcp_len; @@ -1017,13 +1017,61 @@ void ebbrt::NetworkManager::TcpEntry::SendEmptyAck() { rcv_last_acked = rcv_nxt; th.ackno = htonl(rcv_nxt); th.wnd = htons(TcpWindow16(rcv_wnd)); - th.checksum = OffloadPseudoCsum(*buf, kIpProtoTCP, address, std::get<0>(key)); - + th.checksum = 0; //OffloadPseudoCsum(*buf, kIpProtoTCP, address, std::get<0>(key)); + + //auto local_ip = htonl(address.toU32()); + //auto remote_ip = htonl((std::get<0>(key)).toU32()); + + //ebbrt::kprintf("SendEmptyAck() src_ip=0x%llX dst_ip=0x%llX kIpProtoTCP=0x%X buf_len=0x%X src_port=0x%X dst_port=0x%X seq_num=0x%X ack_num=0x%X\n", local_ip, remote_ip, kIpProtoTCP, 0, th.src_port, th.dst_port, th.seqno, th.ackno); + //ebbrt::kprintf("\t flags=0x%X windows=0x%X urgent_pointer=0x%X tcp_header.checksum=0x%X \n", th.hdrlen_flags, th.wnd, th.urgp, th.checksum); + + /*auto pl = buf->ComputeChainDataLength(); + //ebbrt::kprintf("\t packet_len=%u\n", pl); + uint8_t* p1 = reinterpret_cast(buf->MutData()); + int i; + uint32_t sum = 0; + uint16_t word16; + for (i = 0; i < (int)pl; i+=2) { + word16 = ((p1[i]<<8)&0xFF00) + (p1[i+1]&0xFF); + sum = sum + (uint32_t)word16; + } + // pseudo header start + //add src addr + word16 = (local_ip & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((local_ip >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + + //add dst addr + word16 = (remote_ip & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((remote_ip >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + + //add protocol number and length of tcpp packet + sum = sum + kIpProtoTCP + pl; + // pseudo header end + + while(sum >> 16) { + sum = (sum & 0xFFFF) + (sum >> 16); + } + sum 
= (~sum) & 0xFFFF; + th.checksum = htons((uint16_t) sum); + //ebbrt::kprintf("\t new checksum=0x%X\n\n", th.checksum); + */ + PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; pinfo.csum_offset = 16; // checksum is 16 bytes into the TCP header - + + th.checksum = + OffloadPseudoCsum(*buf, kIpProtoTCP, address, std::get<0>(key)); + //OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); + pinfo.tcp_len = 0; + pinfo.tcp_hdr_len = th.HdrLen(); + //ebbrt::kprintf("SendEmptyAck() checksum=0x%X\n\n", th.checksum); + network_manager->SendIp(std::move(buf), address, std::get<0>(key), kIpProtoTCP, pinfo); } @@ -1076,30 +1124,120 @@ void ebbrt::NetworkManager::TcpEntry::SendFin() { // Actually send a segment via IP void ebbrt::NetworkManager::TcpEntry::SendSegment(TcpSegment& segment) { + uint32_t len = segment.buf->ComputeChainDataLength(); + uint32_t totallen = len + sizeof(Ipv4Header) + sizeof(EthernetHeader); rcv_last_acked = rcv_nxt; segment.th.ackno = htonl(rcv_nxt); segment.th.wnd = htons(TcpWindow16(rcv_wnd)); segment.th.checksum = 0; - // XXX: check if checksum offloading is supported - segment.th.checksum = + + // 82599 has a different checksum method when greater than a single MTU, the paylen field is set to 0 + if(totallen > 1490) { + segment.th.checksum = + OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); + } else { + // XXX: check if checksum offloading is supported + segment.th.checksum = OffloadPseudoCsum(*(segment.buf), kIpProtoTCP, address, std::get<0>(key)); + } PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; pinfo.csum_offset = 16; // checksum is 16 bytes into the TCP header + //auto local_ip = htonl(address.toU32()); + //auto remote_ip = htonl((std::get<0>(key)).toU32()); + + //ebbrt::kprintf("SendSegment() src_ip=0x%llX dst_ip=0x%llX kIpProtoTCP=0x%X buf_len=0x%X src_port=0x%X dst_port=0x%X seq_num=0x%X ack_num=0x%X htonl(ack_num)=0x%X\n", local_ip, remote_ip, kIpProtoTCP, 
segment.buf->ComputeChainDataLength(), segment.th.src_port, segment.th.dst_port, segment.th.seqno, segment.th.ackno, htonl(segment.th.ackno)); + //ebbrt::kprintf("\t flags=0x%X windows=0x%X urgent_pointer=0x%X sizeof(TcpHeader)=%d tcp_header.checksum=0x%X\n", segment.th.hdrlen_flags, segment.th.wnd, 0, sizeof(TcpHeader), segment.th.checksum); + + /*uint32_t i, tmp, count, sum, len; + uint16_t word16; + + tmp = count = sum = len = 0x0; + + if(segment.buf->IsChained()) { + for (auto& buf_it : *(segment.buf)) { + auto p1 = reinterpret_cast(buf_it.Data()); + len += buf_it.Length(); + //if(b == 0 && p1[0] == 0x81 && p1[1] == 0x0) { +// b = 1; +// pinfo.flags |= PacketInfo::kNeedsCsum; +// } + for (i = 0; i < buf_it.Length(); i++) { + if (count < 2) { + tmp = tmp | (p1[i] << ((1-count) * 8)); + count ++; + + if(count == 2) { + sum += tmp; + tmp = count = 0x0; + } + } + } + } + } else { + uint8_t* p1 = reinterpret_cast(segment.buf->MutData()); + len = segment.buf->ComputeChainDataLength(); + for (i = 0; i < len; i++) { + if (count < 2) { + tmp = tmp | (p1[i] << ((1-count) * 8)); + count ++; + + if(count == 2) { + sum += tmp; + tmp = count = 0x0; + } + } + } + } + + sum += tmp; + + // pseudo header start + //add src addr + word16 = (local_ip & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((local_ip >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + + //add dst addr + word16 = (remote_ip & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((remote_ip >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + + //add protocol number + sum = sum + kIpProtoTCP + len; + + while(sum >> 16) { + sum = (sum & 0xFFFF) + (sum >> 16); + } + sum = (~sum) & 0xFFFF; + segment.th.checksum = htons((uint16_t) sum); + // pseudo header end + */ + //OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); + + pinfo.tcp_hdr_len = segment.th.HdrLen(); + pinfo.tcp_len = len - pinfo.tcp_hdr_len; + + //ebbrt::kprintf("\nSendSegment() total_len=%u len=%u tcp_hdr_len=%u tcp_len=%u 
checksum=0x%X\n", totallen, len, pinfo.tcp_hdr_len, pinfo.tcp_len, segment.th.checksum); + /*if(b) { + ebbrt::kprintf("len=%u checksum=0x%X\n\n", len, segment.th.checksum); + }*/ + // XXX: Actually store the MSS instead of making this assumption size_t mss = 1460; if (segment.tcp_len > mss) { pinfo.gso_type = PacketInfo::kGsoTcpv4; pinfo.hdr_len = segment.th.HdrLen(); pinfo.gso_size = mss; -#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ - segment.th.checksum = - OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); -#endif +//#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ + +//#endif } - + network_manager->SendIp(CreateRefChain(*(segment.buf)), address, std::get<0>(key), kIpProtoTCP, std::move(pinfo)); } @@ -1110,6 +1248,8 @@ void ebbrt::NetworkManager::TcpReset(bool ack, uint32_t seqno, uint32_t ackno, const Ipv4Address& remote_ip, uint16_t local_port, uint16_t remote_port) { + + ebbrt::kabort("ebbrt::NetworkManager::TcpReset() - Aborting haven't added checksum offloading\n"); auto buf = MakeUniqueIOBuf(sizeof(TcpHeader) + sizeof(Ipv4Header) + sizeof(EthernetHeader)); @@ -1125,9 +1265,12 @@ void ebbrt::NetworkManager::TcpReset(bool ack, uint32_t seqno, uint32_t ackno, tcp_header.SetHdrLenFlags(sizeof(TcpHeader), kTcpRst | (ack ? 
kTcpAck : 0)); tcp_header.wnd = htons(TcpWindow16(kTcpWnd)); tcp_header.urgp = 0; - tcp_header.checksum = - OffloadPseudoCsum(*buf, kIpProtoTCP, local_ip, remote_ip); + tcp_header.checksum = 0; + //OffloadPseudoCsum(*buf, kIpProtoTCP, local_ip, remote_ip); + //ebbrt::kprintf("TcpReset() src_ip=0x%llX dst_ip=0x%llX kIpProtoTCP=0x%X buf_len=0x%X src_port=0x%X dst_port=0x%X seq_num=0x%X ack_num=0x%X\n", local_ip.toU32(), remote_ip.toU32(), kIpProtoTCP, 0, tcp_header.src_port, tcp_header.dst_port, tcp_header.seqno, tcp_header.ackno); + //ebbrt::kprintf("\t flags=0x%X windows=0x%X urgent_pointer=0x%X tcp_header.checksum=0x%X \n\n", tcp_header.hdrlen_flags, tcp_header.wnd, tcp_header.urgp, tcp_header.checksum); + PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; // 14 byte eth header + 20 byte ip header diff --git a/src/native/NetUdp.cc b/src/native/NetUdp.cc index 992ee21f..60ba3a92 100644 --- a/src/native/NetUdp.cc +++ b/src/native/NetUdp.cc @@ -132,6 +132,10 @@ void ebbrt::NetworkManager::Interface::SendUdp(UdpPcb& pcb, Ipv4Address addr, src_addr = itf_addr->address; } + if(data_size % 2 > 0) { + throw std::runtime_error("SendUdp: data buffer size must be multiple of 2"); + } + // Get source port auto src_port = pcb.entry_->port; if (!src_port) @@ -148,12 +152,58 @@ void ebbrt::NetworkManager::Interface::SendUdp(UdpPcb& pcb, Ipv4Address addr, udp_header.length = htons(data_size + sizeof(UdpHeader)); udp_header.checksum = 0; + //uint64_t buff_addr = reinterpret_cast(buf->Data()); + // Append data header_buf->AppendChain(std::move(buf)); udp_header.checksum = - OffloadPseudoCsum(*header_buf, kIpProtoUDP, src_addr, addr); + OffloadPseudoCsum(*header_buf, kIpProtoUDP, src_addr, addr); + //OffloadPseudoCsumTso(kIpProtoUDP, src_addr, addr); + // OffloadPseudoCsum(*header_buf, kIpProtoUDP, src_addr, addr); + + //ebbrt::kprintf("udp_header.checksum=0x%X udp_header.length=%d src_port=%d dst_port=%d kIpProtoUDP=0x%X src_addr=0x%llX dst_addr=0x%llX 
buf_len=%d\n", udp_header.checksum, data_size + sizeof(UdpHeader), src_port, port, kIpProtoUDP, src_addr.toU32(), addr.toU32(), data_size); + /*uint8_t* p1 = reinterpret_cast(buff_addr); + int i; + uint32_t sum = 0; + uint16_t word16; + for (i = 0; i < (int)data_size; i+=2) { + word16 = ((p1[i]<<8)&0xFF00) + (p1[i+1]&0xFF); + sum = sum + (uint32_t)word16; + } + // pseudo header start + //add src addr + word16 = (src_addr.toU32() & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((src_addr.toU32() >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + + //add dst addr + word16 = (addr.toU32() & 0xFFFF); + sum = sum + (uint32_t)word16; + word16 = ((addr.toU32() >> 16) & 0xFFFF); + sum = sum + (uint32_t)word16; + //sum = sum + (uint32_t)(addr.toU32()); + + //add protocol number and length of udp packet + sum = sum + kIpProtoUDP + (data_size + sizeof(UdpHeader)); + // pseudo header end + + // udp header start + // port + sum = sum + (uint32_t) src_port; + sum = sum + (uint32_t) port; + sum = sum + (uint32_t) (data_size + sizeof(UdpHeader)); + // udp header end + + while(sum >> 16) { + sum = (sum & 0xFFFF) + (sum >> 16); + } + sum = (~sum) & 0xFFFF; + udp_header.checksum = htons((uint16_t) sum); + //ebbrt::kprintf("real checksum? 0x%X\n", udp_header.checksum); + */ // XXX: check if checksum offloading is supported PacketInfo pinfo; pinfo.flags |= PacketInfo::kNeedsCsum; From e138668779b7e868044b5be8dc316d752e5ce566 Mon Sep 17 00:00:00 2001 From: Han Date: Wed, 18 Dec 2019 18:19:30 -0500 Subject: [PATCH 13/20] Added interface in Net.cc to configure NIC from application (Config), starting to add perf and other counters such as RAPL for counting power usage. 
--- src/native/IxgbeDriver.cc | 532 ++++++++++++++++++++++++-------------- src/native/IxgbeDriver.h | 61 +++-- src/native/Msr.h | 2 +- src/native/Net.cc | 8 + src/native/Net.h | 10 +- src/native/NetIcmp.cc | 6 +- src/native/Rapl.cc | 11 + src/native/Rapl.h | 127 +++++++++ src/native/VirtioNet.cc | 2 + src/native/VirtioNet.h | 1 + 10 files changed, 538 insertions(+), 222 deletions(-) create mode 100644 src/native/Rapl.cc create mode 100644 src/native/Rapl.h diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 961c985a..39ab6f4f 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -59,7 +59,7 @@ void ebbrt::IxgbeDriver::Create(pci::Device& dev) { // TODO remove? ebbrt::clock::SleepMilli(200); - ebbrt::kprintf("intel 82599 card initialzed\n"); + ebbrt::kprintf("82599 initialze complete\n"); } const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { @@ -67,50 +67,174 @@ const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { } void ebbrt::IxgbeDriver::DumpStats() { - for (size_t i = 0; i < Cpu::Count(); i++) { - ebbrt::kprintf("Core %d STATS:\n", (int)i); - ebbrt::kprintf("\t num_recv_itrs:%lld\n", ixgmq[i]->stat_num_itr); - ebbrt::kprintf("\t num_send:%lld\n", ixgmq[i]->stat_num_send); - ebbrt::kprintf("\t num_rx_desc_proc:%lld\n", ixgmq[i]->stat_num_rx); - ebbrt::kprintf("\t num_tx_desc_proc:%lld\n", ixgmq[i]->stat_num_tx); - - // reset to 0 - ixgmq[i]->stat_num_itr = 0; - ixgmq[i]->stat_num_send = 0; - ixgmq[i]->stat_num_rx = 0; - ixgmq[i]->stat_num_tx = 0; - - /*if(ixgmq[i]->stat_perf == false) { - ixgmq[i]->perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); - ixgmq[i]->perfCycles.Start(); - ixgmq[i]->perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); - ixgmq[i]->perfInst.Start(); - ixgmq[i]->stat_perf =true; + bool printout = false; + uint64_t tins, tcycs, tllc, numr, nums, numtxbytes, numrxbytes; + double ttime, tnrg, twatts; + + tins = tcycs = tllc = numr 
= nums = numtxbytes = numrxbytes = 0; + ttime = tnrg = twatts = 0.0; + + for(size_t i = 0; i < Cpu::Count(); i++) { + if(ixgmq[i]->stat_init == false) { + ixgmq[i]->stat_init = true; } else { + ixgmq[i]->stat_init = false; + printout = true; + + tcycs += ixgmq[i]->totalCycles; + tins += ixgmq[i]->totalIns; + tllc += ixgmq[i]->totalLLCmisses; + + numr += ixgmq[i]->stat_num_recv; + nums += ixgmq[i]->stat_num_send; + numrxbytes += ixgmq[i]->stat_num_rx_bytes; + numtxbytes += ixgmq[i]->stat_num_tx_bytes; + + //ebbrt::kprintf("DumpStats() Core %u \t cycles:%llu \n", i, ixgmq[i]->totalCycles); + //ebbrt::kprintf("\t instructions:%llu\n", ixgmq[i]->totalIns); + //ebbrt::kprintf("\t llc_misses:%llu\n", ixgmq[i]->totalLLCmisses); + //ebbrt::kprintf("\t num_recv:%lld num_send:%lld num_rx_bytes:%lld num_tx_bytes=%lld\n", i, ixgmq[i]->stat_num_recv, ixgmq[i]->stat_num_send, ixgmq[i]->stat_num_rx_bytes, ixgmq[i]->stat_num_tx_bytes); + + if(i == 0) { + ttime = ixgmq[i]->totalTime; + tnrg = ixgmq[i]->totalNrg; + twatts = tnrg / ttime; + + //ebbrt::kprintf("\t Total Time (s): %.2llf\n", ixgmq[i]->totalTime); + //ebbrt::kprintf("\t Total Energy (j): %.2llf\n", ixgmq[i]->totalNrg); + //ebbrt::kprintf("\t Power (Watts): %.2llf\n", ixgmq[i]->totalNrg/ixgmq[i]->totalTime); + } + ixgmq[i]->perfCycles.Stop(); ixgmq[i]->perfInst.Stop(); - double cyc = static_cast(ixgmq[i]->perfCycles.Read()); - double inst = static_cast(ixgmq[i]->perfInst.Read()); - - ebbrt::kprintf("Core %d PMC:\n", (int)i); - ebbrt::kprintf("\t cycles:%llf \n", cyc); - ebbrt::kprintf("\t instructions:%llf\n", inst); - ebbrt::kprintf("\t ipc: %llf\n", inst/cyc); - ixgmq[i]->stat_perf = false; - }*/ - } + ixgmq[i]->perfLLC_miss.Stop(); + + ixgmq[i]->stat_num_recv = 0; + ixgmq[i]->stat_num_send = 0; + ixgmq[i]->stat_num_rx_bytes = 0; + ixgmq[i]->stat_num_tx_bytes = 0; + + // accumulate counters + /*ixgmq[i]->totalCycles += static_cast(ixgmq[i]->perfCycles.Read()); + ixgmq[i]->totalIns += 
static_cast(ixgmq[i]->perfInst.Read()); + ixgmq[i]->totalLLCmisses += static_cast(ixgmq[i]->perfLLC_miss.Read()); + + ebbrt::kprintf("DumpStats() Core %u \t cycles:%llu \n", i, ixgmq[i]->totalCycles); + ebbrt::kprintf("\t instructions:%llu\n", ixgmq[i]->totalIns); + ebbrt::kprintf("\t llc_misses:%llu\n", ixgmq[i]->totalLLCmisses); + ebbrt::kprintf("\t num_recv:%lld num_send:%lld num_rx_bytes:%lld num_tx_bytes=%lld\n", i, ixgmq[i]->stat_num_recv, ixgmq[i]->stat_num_send, ixgmq[i]->stat_num_rx_bytes, ixgmq[i]->stat_num_tx_bytes); + + // clear + ixgmq[i]->perfCycles.Clear(); + ixgmq[i]->perfInst.Clear(); + ixgmq[i]->perfLLC_miss.Clear();*/ + } + } + + if(printout) { + ebbrt::kprintf("\t cycles:%llu\n", tcycs); + ebbrt::kprintf("\t instructions:%llu\n", tins); + ebbrt::kprintf("\t IPC:%.2llf\n", (double)tins/tcycs); + ebbrt::kprintf("\t llc_misses:%llu\n", tllc); + ebbrt::kprintf("\t num_recv:%llu\n", numr); + ebbrt::kprintf("\t num_send:%llu\n", nums); + ebbrt::kprintf("\t num_rx_bytes:%llu\n", numrxbytes); + ebbrt::kprintf("\t num_tx_bytes:%lld\n", numtxbytes); + ebbrt::kprintf("\t total_time:%.2llf\n", ttime); + ebbrt::kprintf("\t total_energy:%.2llf\n", tnrg); + ebbrt::kprintf("\t power:%.2llf\n", twatts); + } + + /*uint32_t i = static_cast(Cpu::GetMine()); + + if(ixgmq[i]->stat_perf == false) { + ixgmq[i]->perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + ixgmq[i]->perfCycles.Start(); + ixgmq[i]->perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); + ixgmq[i]->perfInst.Start(); + ixgmq[i]->perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); + ixgmq[i]->perfLLC_miss.Start(); + + if(i == 0) { + ixgmq[i]->powerMeter = ebbrt::rapl::RaplCounter(); + ixgmq[i]->powerMeter.Start(); + auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + ixgmq[i]->time_us = std::chrono::duration_cast(d).count(); + } + //ebbrt::kprintf("\t Start Time (us): %llu\n", ixgmq[i]->time_us); + + ixgmq[i]->stat_perf =true; + } 
+ else { + ixgmq[i]->perfCycles.Stop(); + ixgmq[i]->perfInst.Stop(); + ixgmq[i]->perfLLC_miss.Stop(); + + uint64_t cyc = static_cast(ixgmq[i]->perfCycles.Read()); + uint64_t inst = static_cast(ixgmq[i]->perfInst.Read()); + uint64_t llc = static_cast(ixgmq[i]->perfLLC_miss.Read()); + + ebbrt::kprintf("Core %u STATS: num_recv:%lld num_send:%lld num_rx_bytes:%lld num_tx_bytes=%lld\n", i, ixgmq[i]->stat_num_recv, ixgmq[i]->stat_num_send, ixgmq[i]->stat_num_rx_bytes, ixgmq[i]->stat_num_tx_bytes); + ebbrt::kprintf("\t cycles:%llu \n", cyc); + ebbrt::kprintf("\t instructions:%llu\n", inst); + ebbrt::kprintf("\t llc_misses:%llu\n", llc); + ebbrt::kprintf("\t ipc: %.2llf\n", (double)inst/cyc); + + if(i == 0) { + ixgmq[i]->powerMeter = ebbrt::rapl::RaplCounter(); + ixgmq[i]->powerMeter.Stop(); + double energyj = ixgmq[i]->powerMeter.Read(); + ebbrt::kprintf("\t Energy (j): %.2llf\n", energyj); + + auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + uint64_t endt = std::chrono::duration_cast(d).count(); + double totaltime = (double)(endt - (ixgmq[i]->time_us)) / 1000000.0; + + //ebbrt::kprintf("\t End (us): %llu\n", endt); + ebbrt::kprintf("\t TotalTime (s): %.2llf\n", totaltime); + ebbrt::kprintf("\t Power (Watts): %.2llf\n", energyj/totaltime); + } + ixgmq[i]->stat_perf = false; + }*/ +} + +void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { + uint32_t i = static_cast(Cpu::GetMine()); + if(s == "rx_usecs") { + ebbrt::kprintf_force("rx-usecs = %u\n", v); + WriteEitr(i, (v << 3) | IXGBE_EITR_CNT_WDIS); + + } else if(s == "start_perf") { + ebb_->StartTimer(); + + } else if(s == "stop_perf") { + ebb_->StopTimer(); + + } else if(s == "print") { + ebbrt::kprintf_force("num_recvs=%u totalt(us) = %u\n", ixgmq[i]->stat_num_recv, ixgmq[i]->ttotalt); + + } else if(s == "clear") { + ixgmq[i]->stat_num_recv = 0; + ixgmq[i]->time_us = 0; + ixgmq[i]->ttotalt = 0; + + } + else { + + ebbrt::kprintf_force("%s Unknown command: %s\n", __PRETTY_FUNCTION__, s); + } } void 
ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { -#ifdef STATS_EN +/*#ifdef STATS_EN if(pinfo.get_stats) { DumpStats(); } -#endif + #endif*/ ebb_->Send(std::move(buf), std::move(pinfo)); } -void ebbrt::IxgbeDriver::Run() { ebb_->Run(); } +//void ebbrt::IxgbeDriver::Run() { ebb_->StartTimer(); } // After packet transmission, need to mark bit in // tx queue so that it can be used again @@ -237,9 +361,9 @@ void ebbrt::IxgbeDriverRep::AddTx(uint64_t pa, uint64_t len, ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; -#ifdef STATS_EN +/*#ifdef STATS_EN ixgmq_.stat_num_tx ++; -#endif + #endif*/ } void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { @@ -248,14 +372,102 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { tdesc_advance_tx_rf_t* arfx; tdesc_advance_ctxt_wb_t* actx; uint32_t mcore = static_cast(Cpu::GetMine()); + //uint32_t free_desc = 0; //int i; - + // On TSO, the maximum PAYLEN can be up to 2^18 - 1 len = buf->ComputeChainDataLength(); if (len > 262144) { ebbrt::kprintf_force("\t kabort Send() len=%u greater than TSO limit of 262144 bytes\n", len); return; } + +/*#ifdef STATS_EN + // counter initialization, only need to do once + if(ixgmq_.stat_init == true && ixgmq_.stat_start == false) { + ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); + ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); + ixgmq_.perfCycles.Start(); + ixgmq_.perfInst.Start(); + ixgmq_.perfLLC_miss.Start(); + + if(mcore == 0) { + ixgmq_.powerMeter = ebbrt::rapl::RaplCounter(); + ixgmq_.powerMeter.Start(); + auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + ixgmq_.time_us = std::chrono::duration_cast(d).count(); + } + + ixgmq_.stat_start = true; + + // every 10000 sends + } else if (ixgmq_.stat_init == true && ixgmq_.stat_start == true && 
ixgmq_.stat_num_send % 10000 == 0) { + //stop counters + ixgmq_.perfCycles.Stop(); + ixgmq_.perfInst.Stop(); + ixgmq_.perfLLC_miss.Stop(); + ixgmq_.powerMeter.Stop(); + auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + uint64_t endt = std::chrono::duration_cast(d).count(); + + // accumulate counters + ixgmq_.totalCycles += static_cast(ixgmq_.perfCycles.Read()); + ixgmq_.totalIns += static_cast(ixgmq_.perfInst.Read()); + ixgmq_.totalLLCmisses += static_cast(ixgmq_.perfLLC_miss.Read()); + ixgmq_.totalTime += ((double)(endt - (ixgmq_.time_us)) / 1000000.0); + ixgmq_.totalNrg += ixgmq_.powerMeter.Read(); + + // clear + ixgmq_.perfCycles.Clear(); + ixgmq_.perfInst.Clear(); + ixgmq_.perfLLC_miss.Clear(); + + // restart again + ixgmq_.perfCycles.Start(); + ixgmq_.perfInst.Start(); + ixgmq_.perfLLC_miss.Start(); + ixgmq_.powerMeter.Start(); + auto dd = ebbrt::clock::Wall::Now().time_since_epoch(); + ixgmq_.time_us = std::chrono::duration_cast(dd).count(); + } + + ixgmq_.stat_num_send ++; + ixgmq_.stat_num_tx_bytes += len; + #endif*/ + + /*if(ixgmq_.tx_tail_ > ixgmq_.tx_head_) { + free_desc = IxgbeDriver::NTXDESCS - (ixgmq_.tx_tail_ - ixgmq_.tx_head_); + } else if(ixgmq_.tx_tail_ < ixgmq_.tx_head_){ + free_desc = IxgbeDriver::NTXDESCS - ((ixgmq_.tx_tail_+IxgbeDriver::NTXDESCS) - ixgmq_.tx_head_); + } else { + free_desc = IxgbeDriver::NTXDESCS; + } + + // 40 descriptors is theoretical limit of how many descriptors can be used at once + if(free_desc < 60) { + + // from first sent descriptor + for (auto rit = ixgmq_.send_to_watch.begin(); rit != ixgmq_.send_to_watch.end(); ++rit) { + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[*rit])); + + // Force memory writes to complete before letting h/w know there + // are new descriptors to fetch. (Only applicable for weak-ordered + // memory model archs, such as IA-64). 
+ asm volatile("sfence" ::: "memory"); + + // wait until its sent + while(arfx->dd == 0) { + // makes sure all reads are finished before checking again + asm volatile("lfence":::"memory"); + } + + // increment head ptr + ixgmq_.tx_head_ = (*rit + 1) % ixgmq_.tx_size_; + //ebbrt::kprintf("\t core=%u Reclaimed *rit=%u head=%u\n", mcore, *rit, ixgmq_.tx_head_); + } + ixgmq_.send_to_watch.clear(); + }*/ if(buf->IsChained()) { b = MakeUniqueIOBuf(len); @@ -296,9 +508,10 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { arfx->dext = 1; //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - } - else { + } else { if(len > IXGBE_MAX_DATA_PER_TXD) { actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); actx->raw_1 = 0x0; @@ -326,7 +539,6 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { actx->l4len = pinfo.tcp_hdr_len; //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - //first descriptor arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); @@ -345,6 +557,7 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload length, so no header length arfx->paylen = pinfo.tcp_len; //ebbrt::kprintf("Send() first descriptor mcore=%u tail=%u dtalen=%u paylen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, IXGBE_MAX_DATA_PER_TXD, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), 
(uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; tsodata = data; @@ -366,6 +579,7 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { if(tsolen > IXGBE_MAX_DATA_PER_TXD) { arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; //ebbrt::kprintf("Send() middle descriptor(s) mcore=%u tail=%u dtalen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, IXGBE_MAX_DATA_PER_TXD, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; } else { // last descriptor @@ -374,10 +588,11 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { arfx->rs = 1; //ebbrt::kprintf("Send() last descriptor mcore=%u tail=%u dtalen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, tsolen, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; } - } - + } } else if(len > 1490 && len < IXGBE_MAX_DATA_PER_TXD) { actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); @@ -427,6 +642,8 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size arfx->paylen = pinfo.tcp_len; //ebbrt::kprintf("Send mcore=%u tail=%u dtalen=%u paylen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, len, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; } else { actx = 
reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); @@ -468,10 +685,9 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { arfx->dtyp = 0x3; arfx->eop = 1; - arfx->ifcs = 1; - arfx->rs = 1; - + arfx->ifcs = 1; + arfx->dext = 1; arfx->tse = 0; @@ -482,101 +698,27 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { arfx->txsm = 1; //} //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; + //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; } } - /*if(len > 1448) { - //dumpPacketContents(reinterpret_cast(data), len); - //ebbrt::kabort("kabort Send()\n"); - actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); - actx->raw_1 = 0x0; - actx->raw_2 = 0x0; - actx->iplen = IPHDR_LEN; - actx->maclen = ETHHDR_LEN; - // ip packet type = ipv4: 01 - actx->ipv4 = 1; - // l4type = tcp: 01 - actx->l4t = 1; - // for context descriptor 0b0010 - actx->dtyp = 0x2; - // descriptor extension, one for advanced mode - actx->dext = 1; - // from Linux - actx->mss = 1448; - // TCP header length, with no tcp options == 20 - actx->l4len = pinfo.tcp_hdr_len; - - ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); - - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - - arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); - arfx->raw[0] = 0x0; - arfx->raw[1] = 0x0; - arfx->address = data; - - // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
- // Max length is 15.5 KB - arfx->dtalen = len; - arfx->dtyp = 0x3; - arfx->eop = 1; - arfx->rs = 1; - arfx->ifcs = 1; - arfx->dext = 1; - arfx->tse = 1; - - arfx->ixsm = 1; - arfx->txsm = 1; - // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size - arfx->paylen = pinfo.tcp_len; - - ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - //ebbrt::kabort("Exiting\n"); - - } else { - arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); - arfx->raw[0] = 0x0; - arfx->raw[1] = 0x0; - - arfx->address = data; - - // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. - // Max length is 15.5 KB - arfx->dtalen = len; - - // In a single-send packet, PAYLEN defines the entire packet size fetched from host memory. - arfx->paylen = len; - - // crc checksum - arfx->ifcs = 1; - - // rs bit should only be set when eop is set - arfx->eop = 1; - arfx->rs = 1; - - // type is advanced - arfx->dtyp = 0x3; - arfx->dext = 1; - - ebbrt::kprintf("Send mcore=%u tail=%u tcp_hdr_len=%u tcp_len=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, pinfo.tcp_hdr_len, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - }*/ + //ebbrt::kprintf("\t Send() core=%u head=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_tail_, free_desc); - // Force memory writes to complete before letting h/w know there - // are new descriptors to fetch. (Only applicable for weak-ordered - // memory model archs, such as IA-64). 
asm volatile("sfence" ::: "memory"); + //ebbrt::kprintf("\t Send() core=%u head=%u last_tail=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_last_tail_, ixgmq_.tx_tail_, free_desc); WriteTdt_1(mcore, ixgmq_.tx_tail_); - // keep looping until processed while(arfx->dd == 0) { // makes sure all reads are finished before checking again asm volatile("lfence":::"memory"); } + auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + ixgmq_.time_us = std::chrono::duration_cast(d).count(); + //rtdh = ReadTdh_1(mcore); //rtdt = ReadTdt_1(mcore); //ebbrt::kprintf("\t Send() core=%u After len=%d rtdh=%u %rtdt=%u tail=%u\n\n", mcore, len, rtdh, rtdt, ixgmq_.tx_tail_); @@ -2012,21 +2154,18 @@ void ebbrt::IxgbeDriver::Init() { // Fill in RSS redirection table (128 entries), sets which core the lowest 7 bits of hashed output goes to // hacky atm for (auto i = 0; i < 32; i += 4) { - if(ncore > 0) { - //WriteReta(i, 0x0000000); - //WriteReta(i+1, 0x0000000); - //WriteReta(i+2, 0x0000000); - //WriteReta(i+3, 0x0000000); + /*if(ncore > 0) { + WriteReta(i, 0x0000000); + WriteReta(i+1, 0x0000000); + WriteReta(i+2, 0x0000000); + WriteReta(i+3, 0x0000000); WriteReta(i, 0x03020100); WriteReta(i+1, 0x07060504); WriteReta(i+2, 0x0B0A0908); WriteReta(i+3, 0x0F0E0D0C); - } - /*if(ncore > 0) { - WriteReta(i, 0x01010101); }*/ // all route to core 0 - /*if(ncore == 1) { + if(ncore == 1) { WriteReta(i, 0x0000000); WriteReta(i+1, 0x0000000); WriteReta(i+2, 0x0000000); @@ -2046,49 +2185,15 @@ void ebbrt::IxgbeDriver::Init() { WriteReta(i+1, 0x7060504); WriteReta(i+2, 0x3020100); WriteReta(i+3, 0x7060504); + } else if(ncore == 16){ + WriteReta(i, 0x03020100); + WriteReta(i+1, 0x07060504); + WriteReta(i+2, 0x0B0A0908); + WriteReta(i+3, 0x0F0E0D0C); } else { - WriteReta(i, 0x3020100); //8 - WriteReta(i+1, 0x7060504); //8 - WriteReta(i+2, 0xB0A0908); // 8 - //WriteReta(i+3, 0x3020100); - //WriteReta(i+2, 0xB0A0908); - //WriteReta(i+3, 0xF0E0D0C); - }*/ + ebbrt::kabort("%s: Can only 
redirect interrupts to 16 cores\n", __FUNCTION__); + } } - - //temp -/* WriteReta(3, 0x3020100); - WriteReta(7, 0x7060504); - WriteReta(11, 0xB0A0908); - WriteReta(15, 0x3020100); - WriteReta(19, 0x7060504); - WriteReta(23, 0xB0A0908); - WriteReta(27, 0x3020100); - WriteReta(31, 0x7060504); - WriteReta(35, 0xB0A0908); - WriteReta(39, 0x3020100); - WriteReta(43, 0x7060504); - WriteReta(47, 0xB0A0908); - WriteReta(51, 0x3020100); - WriteReta(55, 0x7060504); - WriteReta(59, 0xB0A0908); - WriteReta(63, 0x3020100); - WriteReta(67, 0x7060504); - WriteReta(71, 0xB0A0908); - WriteReta(75, 0x3020100); - WriteReta(79, 0x7060504); - WriteReta(83, 0xB0A0908); - WriteReta(87, 0x3020100); - WriteReta(91, 0x7060504); - WriteReta(95, 0xB0A0908); - WriteReta(99, 0x3020100); - WriteReta(103, 0x7060504); - WriteReta(107, 0xB0A0908); - WriteReta(111, 0x3020100); - WriteReta(115, 0x7060504); - WriteReta(119, 0xB0A0908); - WriteReta(123, 0x3020100); - WriteReta(127, 0x3080400);*/ for (auto i = 0; i < 128; i++) { WriteFtqf(i, 0x0); @@ -2380,7 +2485,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteTdt(i, 0x0); ixgmq[i]->tx_tail_=0; - + // TODO: set up dca txctrl FreeBSD? 
// clear TXdescWBROen //WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); @@ -2414,9 +2519,9 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, rdesc_adv_wb_t* tmp; tmp = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); - // if rx packet not ready + // if no rx packets ready if (!(tmp->dd)) { - return 1; + return 0; } auto rsccnt = tmp->rsccnt; @@ -2458,8 +2563,8 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, // bump head ptr ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - - return 0; + + return 1; } // not sure what case this is, no context started, eop is set but rsccnt > 0 else if (rsccnt > 0 && tmp->eop && !(ixgmq_.rsc_used)) { @@ -2492,7 +2597,7 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, // bump head ptr ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - return 0; + return 1; } // START NEW RSC CONTEXT else if (rsccnt > 0 && !(tmp->eop) && !(ixgmq_.rsc_used)) { @@ -2550,7 +2655,7 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, *process_rsc = true; - return 0; + return 1; } else { // shouldn't hit here ebbrt::kabort("%s Not sure what state\n", __FUNCTION__); @@ -2594,7 +2699,7 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, } #endif*/ - return 1; + return 0; } void ebbrt::IxgbeDriverRep::ReceivePoll() { @@ -2606,16 +2711,30 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { uint32_t rnt; uint32_t rxhead; process_rsc = false; -#ifdef STATS_EN - ixgmq_.stat_num_itr ++; -#endif rxflag = 0; count = 0; rnt = 0; uint32_t mcore = static_cast(Cpu::GetMine()); +#ifdef STATS_EN + ixgmq_.stat_num_recv ++; +#endif - // get address of buffer with data - while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 0) { + if(ixgmq_.time_us == 0) { + //auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + //ixgmq_.time_us = std::chrono::duration_cast(d).count(); + } else { + auto d = 
ebbrt::clock::Wall::Now().time_since_epoch(); + uint64_t endt = std::chrono::duration_cast(d).count(); + ixgmq_.ttotalt += (endt - ixgmq_.time_us); + + //ebbrt::kprintf("Core %u: time elapsed (us): %llu\n", mcore, endt - ixgmq_.time_us); + + //auto dd = ebbrt::clock::Wall::Now().time_since_epoch(); + //ixgmq_.time_us = std::chrono::duration_cast(dd).count(); + } + + // while there are still packets received + while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 1) { // hit last rsc context, start to process all buffers if (process_rsc) { process_rsc = false; @@ -2650,15 +2769,20 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { }*/ root_.itf_.Receive(std::move(b), rxflag); } else { - //count ++; + count ++; //ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d\n", mcore, len, rxhead); + +#ifdef STATS_EN + ixgmq_.stat_num_rx_bytes += len; +#endif ixgmq_.circ_buffer_[rxhead]->SetLength(len); auto b = std::move(ixgmq_.circ_buffer_[rxhead]); // bump tail ptr ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + /*if (len > 60) { ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d START\n", mcore, len, rxhead); @@ -2690,13 +2814,8 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { reinterpret_cast((ixgmq_.circ_buffer_[rxhead])->MutData()); // update buffer with new adder ixgmq_.rx_ring_[rxhead].buffer_address = rxphys; - - // TODO: Update tail register here or above? - //if (count > 0) { - // update reg WriteRdt_1(mcore, ixgmq_.rx_tail_); - //} - + /*// done with buffer addr above, now to reuse it auto tail = ixgmq_.rx_tail_; @@ -2741,6 +2860,12 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { }*/ } } + + // TODO: Update tail register here or above? 
+// if (count > 0) { + // update reg + // WriteRdt_1(mcore, ixgmq_.rx_tail_); + //} } ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) @@ -2748,6 +2873,23 @@ ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) ixgmq_(root.GetMultiQueue(Cpu::GetMine())), receive_callback_([this]() { ReceivePoll(); }) { //this->ReceivePoll(); + /*auto timeout = + std::chrono::seconds(1); + timer->Start(*this, timeout,true);*/ +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StartTimer() { + auto timeout = std::chrono::seconds(1); + timer->Start(*this, timeout, true); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StopTimer() { + timer->Stop(*this); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::Fire() { + uint32_t mcore = static_cast(Cpu::GetMine()); + ebbrt::kprintf_force("Core %u: Fire()\n", mcore); } uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 468b9f79..6bcfcd97 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -19,6 +19,7 @@ #include "Pfn.h" #include "SlabAllocator.h" #include "Perf.h" +#include "Rapl.h" // Receive Side Scaling (RSC) enabled //#define RSC_EN @@ -28,11 +29,12 @@ //#define TX_HEAD_WB // Collect Statistics Flag -//#define STATS_EN +#define STATS_EN //#define MAX_DESC -namespace ebbrt { +namespace ebbrt { + // Per-core receive and transmit queue typedef struct { rdesc_legacy_t* rx_ring; @@ -67,7 +69,7 @@ class IxgbeDriver : public EthernetDevice { // each core gets a queue struct ixgmq.resize(Cpu::Count()); } - + static void Create(pci::Device& dev); static bool Probe(pci::Device& dev) { if (dev.GetVendorId() == kIxgbeVendorId && @@ -77,12 +79,14 @@ class IxgbeDriver : public EthernetDevice { } return false; } - - void Run(); + + //void Run(); void Send(std::unique_ptr buf, PacketInfo pinfo) override; + void Config(std::string s, uint32_t v) override; const EthernetAddress& GetMacAddress() override; protected: + static const 
constexpr uint16_t kIxgbeVendorId = 0x8086; static const constexpr uint16_t kIxgbeDeviceId = 0x10FB; @@ -101,10 +105,8 @@ class IxgbeDriver : public EthernetDevice { static const constexpr uint32_t NTXDESCS = 8192; static const constexpr uint32_t NRXDESCS = 8192; #else - //static const constexpr uint32_t NTXDESCS = 512; - //static const constexpr uint32_t NRXDESCS = 512; - static const constexpr uint32_t NTXDESCS = 128; - static const constexpr uint32_t NRXDESCS = 128; + static const constexpr uint32_t NTXDESCS = 64; + static const constexpr uint32_t NRXDESCS = 64; #endif // Linux Defaults @@ -114,7 +116,7 @@ class IxgbeDriver : public EthernetDevice { //static const constexpr uint32_t RXBUFSZ = 4096; //static const constexpr uint32_t RXBUFSZ = 16384; - static const constexpr uint8_t ITR_INTERVAL = 6; + static const constexpr uint8_t ITR_INTERVAL = 200; // 3 bits only (0 - 7) in (RSC_DELAY + 1) * 4 us static const constexpr uint8_t RSC_DELAY = 1; @@ -142,6 +144,11 @@ class IxgbeDriver : public EthernetDevice { // TODO: should be optimized rsc_chain_.reserve(NRXDESCS+1); + // keeps a log of descriptors where eop == 1 + // used to coalesce reclaiming of tx descriptors + // once the threshold of some limit is hit + send_to_watch.reserve(NRXDESCS); + // RX ring buffer allocation auto sz = align::Up(sizeof(rdesc_legacy_t) * NRXDESCS, 4096); auto order = Fls(sz - 1) - pmem::kPageShift + 1; @@ -198,7 +205,7 @@ class IxgbeDriver : public EthernetDevice { ebbrt::kbugon((tx_size_bytes_ & 0x7F) != 0, "tx_size_bytes_ not 128 byte aligned\n"); } - + size_t rx_head_; size_t rx_tail_; size_t rx_size_; @@ -215,7 +222,8 @@ class IxgbeDriver : public EthernetDevice { std::vector> circ_buffer_; std::vector> rsc_chain_; - + std::vector send_to_watch; + rdesc_legacy_t* rx_ring_; tdesc_legacy_t* tx_ring_; bool* tx_isctx_; @@ -228,18 +236,28 @@ class IxgbeDriver : public EthernetDevice { #endif // stats - uint64_t stat_num_itr{0}; + uint64_t stat_num_recv{0}; uint64_t stat_num_send{0}; - 
uint64_t stat_num_rx{0}; - uint64_t stat_num_tx{0}; - - bool stat_perf{false}; + uint64_t stat_num_rx_bytes{0}; + uint64_t stat_num_tx_bytes{0}; + uint64_t time_us{0}; + uint64_t ttotalt{0}; + uint64_t totalCycles{0}; + uint64_t totalIns{0}; + uint64_t totalLLCmisses{0}; + double totalNrg{0.0}; + double totalTime{0.0}; + + bool stat_start{false}; + bool stat_init{false}; ebbrt::perf::PerfCounter perfCycles; ebbrt::perf::PerfCounter perfInst; ebbrt::perf::PerfCounter perfLLC_ref; ebbrt::perf::PerfCounter perfLLC_miss; ebbrt::perf::PerfCounter perfTLB_store_miss; ebbrt::perf::PerfCounter perfTLB_load_miss; + ebbrt::rapl::RaplCounter powerMeter; + }; private: @@ -483,7 +501,7 @@ class IxgbeDriver : public EthernetDevice { friend class IxgbeDriverRep; }; // class IxgbeDriver -class IxgbeDriverRep : public MulticoreEbb { +class IxgbeDriverRep : public MulticoreEbb, Timer::Hook { public: explicit IxgbeDriverRep(const IxgbeDriver& root); void Run(); @@ -495,7 +513,9 @@ class IxgbeDriverRep : public MulticoreEbb { enum l4_type l4type); void AddTx(uint64_t pa, uint64_t len, uint64_t totallen, bool first, bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum, bool tse, int hdr_len); - + void StartTimer(); + void StopTimer(); + private: uint16_t ReadRdh_1(uint32_t n); uint16_t ReadRdt_1(uint32_t n); @@ -509,7 +529,8 @@ class IxgbeDriverRep : public MulticoreEbb { uint32_t ReadTdt_1(uint32_t n); uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, bool* process_rsc, uint32_t* rnt, uint32_t* rxhead); - + void Fire() override; + const IxgbeDriver& root_; e10k_queue_t& ixgq_; IxgbeDriver::e10Kq& ixgmq_; diff --git a/src/native/Msr.h b/src/native/Msr.h index 9adc0699..a5358f4a 100644 --- a/src/native/Msr.h +++ b/src/native/Msr.h @@ -29,7 +29,7 @@ inline uint64_t Read(uint32_t index) { inline void Write(uint32_t index, uint64_t data) { uint32_t low = data; - uint32_t high = data >> 32; + uint32_t high = (data >> 32) & 0xFFFFFFFF; #ifdef 
__EBBRT_ENABLE_BAREMETAL_NIC__ // TODO - correct fix is here? diff --git a/src/native/Net.cc b/src/native/Net.cc index f65bfe47..c3f63e4a 100644 --- a/src/native/Net.cc +++ b/src/native/Net.cc @@ -59,3 +59,11 @@ void ebbrt::NetworkManager::Interface::Send(std::unique_ptr b, PacketInfo pinfo) { ether_dev_.Send(std::move(b), std::move(pinfo)); } + +void ebbrt::NetworkManager::Config(std::string s, uint32_t v) { + interface_->Config(std::move(s), v); +} + +void ebbrt::NetworkManager::Interface::Config(std::string s, uint32_t v) { + ether_dev_.Config(std::move(s), v); +} diff --git a/src/native/Net.h b/src/native/Net.h index 65ad3595..4125f2f2 100644 --- a/src/native/Net.h +++ b/src/native/Net.h @@ -55,6 +55,7 @@ class EthernetDevice { public: virtual void Send(std::unique_ptr buf, PacketInfo pinfo = PacketInfo()) = 0; + virtual void Config(std::string s, uint32_t v) = 0; virtual const EthernetAddress& GetMacAddress() = 0; virtual ~EthernetDevice() {} }; @@ -246,6 +247,8 @@ class NetworkManager : public StaticSharedEbb { std::unique_ptr buf); void SendIp(std::unique_ptr buf, Ipv4Address src, Ipv4Address dst, uint8_t proto, PacketInfo pinfo = PacketInfo()); + void Config(std::string s, uint32_t v); + const EthernetAddress& MacAddress(); const ItfAddress* Address() const { return address_.get(); } void SetAddress(std::unique_ptr address) { @@ -256,7 +259,7 @@ class NetworkManager : public StaticSharedEbb { private: struct DhcpPcb : public CacheAligned, public Timer::Hook { void Fire() override; - + UdpPcb udp_pcb; DhcpMessage last_offer; enum State { kInactive, kSelecting, kRequesting, kBound } state; @@ -311,9 +314,10 @@ class NetworkManager : public StaticSharedEbb { Interface& NewInterface(EthernetDevice& ether_dev); Ipv4Address IpAddress(); - + void Config(std::string s, uint32_t v); + private: - Future StartDhcp(); + Future StartDhcp(); void SendIp(std::unique_ptr buf, Ipv4Address src, Ipv4Address dst, uint8_t proto, PacketInfo = PacketInfo()); void TcpReset(bool 
ack, uint32_t seqno, uint32_t ackno, diff --git a/src/native/NetIcmp.cc b/src/native/NetIcmp.cc index a03c616e..623e7684 100644 --- a/src/native/NetIcmp.cc +++ b/src/native/NetIcmp.cc @@ -12,7 +12,7 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( EthernetHeader& eth_header, Ipv4Header& ip_header, std::unique_ptr buf) { auto packet_len = buf->ComputeChainDataLength(); - ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); + //ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); if (unlikely(packet_len < sizeof(IcmpHeader))) return; @@ -20,7 +20,7 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( auto dp = buf->GetMutDataPointer(); auto& icmp_header = dp.Get(); - ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); + //ebbrt::kprintf("ReceiveIcmp() packet_len=%u\n", packet_len); #ifndef __EBBRT_ENABLE_BAREMETAL_NIC__ // software checksum if (IpCsum(*buf)) @@ -51,7 +51,7 @@ void ebbrt::NetworkManager::Interface::ReceiveIcmp( PacketInfo pinfo; pinfo.flags = 0; // hijacking ping to dump ixgbe statistics - pinfo.get_stats = false; + pinfo.get_stats = true; //#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ // hardware ip checksum offload // pinfo.flags |= PacketInfo::kNeedsIpCsum; diff --git a/src/native/Rapl.cc b/src/native/Rapl.cc new file mode 100644 index 00000000..a1438b9d --- /dev/null +++ b/src/native/Rapl.cc @@ -0,0 +1,11 @@ +#include "Debug.h" +//#include "Msr.h" +#include "Rapl.h" + +ebbrt::rapl::RaplCounter::~RaplCounter() { + return; +} + +double ebbrt::rapl::RaplCounter::Read() { + return counter_offset; +} diff --git a/src/native/Rapl.h b/src/native/Rapl.h new file mode 100644 index 00000000..5bd2ca8d --- /dev/null +++ b/src/native/Rapl.h @@ -0,0 +1,127 @@ +// Copyright Boston University SESA Group 2013 - 2016. +// Distributed under the Boost Software License, Version 1.0. 
+// (See accompanying file LICENSE_1_0.txt or copy at +// http://www.boost.org/LICENSE_1_0.txt) +// +#ifndef BAREMETAL_SRC_INCLUDE_EBBRT_RAPL_H_ +#define BAREMETAL_SRC_INCLUDE_EBBRT_RAPL_H_ +#include +#include + +#include "Debug.h" +#include "Msr.h" +#include "Clock.h" + +namespace ebbrt { +namespace rapl { + const constexpr uint32_t kMsrIntelRaplPowerUnit = 0x606; + + /* Package RAPL Domain */ + const constexpr uint32_t kMsrPkgRaplPowerLimit = 0x610; + const constexpr uint32_t kMsrIntelPkgEnergyStatus = 0x611; + const constexpr uint32_t kMsrPkgPerfStatus = 0x613; + const constexpr uint32_t kMsrPkgPowerInfo = 0x614; + + /* PP0 RAPL Domain */ + const constexpr uint32_t kMsrPp0PowerLimit = 0x638; + const constexpr uint32_t kMsrIntelPp0EnergyStatus = 0x639; + const constexpr uint32_t kMsrPp0Policy = 0x63A; + const constexpr uint32_t kMsrPp0PerfStatus = 0x63B; + + /* PP1 RAPL Domain, may reflect to uncore devices */ + const constexpr uint32_t kMsrPp1PowerLimit = 0x640; + const constexpr uint32_t kMsrPp1EnergyStatus = 0x641; + const constexpr uint32_t kMsrPp1Polcy = 0x642; + + /* DRAM RAPL Domain */ + const constexpr uint32_t kMsrDramPowerLimit = 0x618; + const constexpr uint32_t kMsrDramEnergyStatus = 0x619; + const constexpr uint32_t kMsrDramPerfStatus = 0x61B; + const constexpr uint32_t kMsrDramPowerInfo = 0x61C; + + /* PSYS RAPL Domain */ + const constexpr uint32_t kMsrPlatformEnergyStatus = 0x64d; + + /* RAPL UNIT BITMASK */ + const constexpr uint32_t POWER_UNIT_OFFSET = 0; + const constexpr uint32_t POWER_UNIT_MASK = 0x0F; + + const constexpr uint32_t ENERGY_UNIT_OFFSET = 0x08; + const constexpr uint32_t ENERGY_UNIT_MASK = 0x1F00; + + const constexpr uint32_t TIME_UNIT_OFFSET = 0x10; + const constexpr uint32_t TIME_UNIT_MASK = 0xF000; + + class RaplCounter { //: public Timer::Hook { + public: + RaplCounter() { + uint64_t res = ebbrt::msr::Read(kMsrIntelRaplPowerUnit); + rapl_power_units = pow(0.5,(double)(res&0xf)); + rapl_cpu_energy_units = 
pow(0.5,(double)((res>>8)&0x1f)); + rapl_time_units = pow(0.5,(double)((res>>16)&0xf)); + rapl_dram_energy_units = rapl_cpu_energy_units; + }; + // move constructors + RaplCounter(RaplCounter&& other) { + rapl_power_units = other.rapl_power_units; + rapl_cpu_energy_units = other.rapl_cpu_energy_units; + rapl_time_units = other.rapl_time_units; + rapl_dram_energy_units = other.rapl_dram_energy_units; + + other.rapl_power_units = 0.0; + other.rapl_cpu_energy_units = 0.0; + other.rapl_time_units = 0.0; + other.rapl_dram_energy_units = 0.0; + }; + RaplCounter& operator=(RaplCounter&& other) { + rapl_power_units = other.rapl_power_units; + rapl_cpu_energy_units = other.rapl_cpu_energy_units; + rapl_time_units = other.rapl_time_units; + rapl_dram_energy_units = other.rapl_dram_energy_units; + + other.rapl_power_units = 0.0; + other.rapl_cpu_energy_units = 0.0; + other.rapl_time_units = 0.0; + other.rapl_dram_energy_units = 0.0; + + return *this; + }; + + // delete implicitly created copy constructor + RaplCounter(const RaplCounter& other) = delete; + RaplCounter& operator=(const RaplCounter& other) = delete; + + ~RaplCounter(); + void Start() { + uint64_t res = ebbrt::msr::Read(kMsrIntelPkgEnergyStatus); + counter_offset = (double)res*rapl_cpu_energy_units; + + /*ebbrt::kprintf("\t\tPower units = %.3fW\n",rapl_power_units); + ebbrt::kprintf("\t\tCPU Energy units = %.8fJ\n",rapl_cpu_energy_units); + ebbrt::kprintf("\t\tDRAM Energy units = %.8fJ\n",rapl_dram_energy_units); + ebbrt::kprintf("\t\tTime units = %.8fs\n",rapl_time_units); */ + //ebbrt::kprintf("Package Energy before: %.6fJ\n", counter_offset); + } + + void Stop() { + uint64_t res = ebbrt::msr::Read(kMsrIntelPkgEnergyStatus); + double after = (double)res*rapl_cpu_energy_units; + //ebbrt::kprintf("Package Energy after: %.6fJ\n", after); + counter_offset = after - counter_offset; + //ebbrt::kprintf("Total Package Energy used: %.6fJ\n", after - counter_offset); + } + + double Read(); + private: + double 
rapl_power_units{0.0}; + double rapl_cpu_energy_units{0.0}; + double rapl_time_units{0.0}; + double rapl_dram_energy_units{0.0}; + double counter_offset{0.0}; + //void Fire() override; + }; + +} // namespace rapl +} // namespace ebbrt + +#endif diff --git a/src/native/VirtioNet.cc b/src/native/VirtioNet.cc index 61064a2f..07c982e5 100644 --- a/src/native/VirtioNet.cc +++ b/src/native/VirtioNet.cc @@ -164,6 +164,8 @@ ebbrt::VirtioNetRep::VirtioNetRep(const VirtioNetDriver& root) receive_callback_([this]() { ReceivePoll(); }), circ_buffer_head_(0), circ_buffer_tail_(0) {} +void ebbrt::VirtioNetDriver::Config(std::string s, uint32_t v) {} + void ebbrt::VirtioNetDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { ebb_->Send(std::move(buf), std::move(pinfo)); diff --git a/src/native/VirtioNet.h b/src/native/VirtioNet.h index 63b616b5..dac8b043 100644 --- a/src/native/VirtioNet.h +++ b/src/native/VirtioNet.h @@ -24,6 +24,7 @@ class VirtioNetDriver : public VirtioDriver, static void Create(pci::Device& dev); static uint32_t GetDriverFeatures(); void Send(std::unique_ptr buf, PacketInfo pinfo) override; + void Config(std::string s, uint32_t v) override; const EthernetAddress& GetMacAddress() override; private: From 9fabb7a3edb1acdf63f0c36b20814aeb8f17ab9e Mon Sep 17 00:00:00 2001 From: Han Date: Mon, 20 Jan 2020 15:55:22 -0500 Subject: [PATCH 14/20] added RAPL power limiting --- src/native/EventManager.cc | 28 +- src/native/EventManager.h | 3 +- src/native/IxgbeDriver.cc | 782 ++++++++++--------------------------- src/native/IxgbeDriver.h | 51 ++- src/native/Net.cc | 8 + src/native/Net.h | 3 + src/native/NetTcp.cc | 93 +---- src/native/Rapl.h | 39 ++ src/native/VirtioNet.cc | 1 + src/native/VirtioNet.h | 1 + 10 files changed, 323 insertions(+), 686 deletions(-) diff --git a/src/native/EventManager.cc b/src/native/EventManager.cc index dc1f7f2f..79829ba8 100644 --- a/src/native/EventManager.cc +++ b/src/native/EventManager.cc @@ -105,11 +105,13 @@ SwitchStack(uintptr_t 
first_param, uintptr_t stack, void (*func)(uintptr_t)); void ebbrt::EventManager::StartProcessingEvents() { auto stack_top = (active_event_context_.stack + kStackPages).ToAddr(); + //ebbrt::kprintf_force("StartProcessingEvents()\n"); SwitchStack(reinterpret_cast(this), stack_top, CallProcess); } void ebbrt::EventManager::CallProcess(uintptr_t mgr) { auto pmgr = reinterpret_cast(mgr); + //ebbrt::kprintf_force("CallProcess()\n"); pmgr->Process(); } @@ -130,6 +132,9 @@ template void ebbrt::EventManager::InvokeFunction(F&& f) { void ebbrt::EventManager::Process() { auto stack_top = (active_event_context_.stack + kStackPages).ToAddr(); Cpu::GetMine().SetEventStack(stack_top); + unsigned long ecx, edx, eax; + ecx = edx = eax = 0; + // process an interrupt without halting // the sti instruction starts processing interrupts *after* the next // instruction is executed (to allow for a halt for example). The nop gives us @@ -141,6 +146,7 @@ void ebbrt::EventManager::Process() { // If an interrupt was processed then we would not reach this code (the // interrupt does not return here but instead to the top of this function) + //ebbrt::kprintf_force("p1\n"); if (!tasks_.empty()) { auto f = std::move(tasks_.front()); tasks_.pop_front(); @@ -153,9 +159,20 @@ void ebbrt::EventManager::Process() { InvokeFunction(*idle_callback_); goto process; } - + + asm volatile(".byte 0x0f, 0x01, 0xc8;" + :: "a" ((void*)&flags), "c" (ecx), "d"(edx)); + ecx = 1; + + //eax = 0x20; + eax = 0x60; + //eax = 0x30; + + asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" + :: "a" (eax), "c" (ecx)); + asm volatile("sti;" - "hlt;"); + "hlt;"); kabort("Woke up from halt?!?!"); } @@ -204,6 +221,7 @@ void ebbrt::EventManager::CallSync(uintptr_t mgr) { // "fresh" event. 
Therefore if the sync_contexts_ stack is empty, we just go // back to the event loop if (unlikely(pmgr->sync_contexts_.empty())) { + //ebbrt::kprintf_force("CallSync, Process()\n"); pmgr->Process(); } else { // save this stack @@ -340,6 +358,7 @@ void ebbrt::EventManager::ProcessInterrupt(int num) { auto& f = ih->func; InvokeFunction(f); } + //ebbrt::kprintf_force("ProcessInterrupt %d\n", num); Process(); } @@ -410,12 +429,15 @@ void ebbrt::EventManager::Fire() { tasks.pop(); } } else { + //ebbrt::kprintf("EventManager()::Fire()\n"); StartTimer(); } } void ebbrt::EventManager::StartTimer() { - timer->Start(*this, std::chrono::milliseconds(1), + //timer->Start(*this, std::chrono::milliseconds(2), + // /* repeat = */ false); + timer->Start(*this, std::chrono::microseconds(1500), /* repeat = */ false); } diff --git a/src/native/EventManager.h b/src/native/EventManager.h index 2f40e1fa..ba3195c8 100644 --- a/src/native/EventManager.h +++ b/src/native/EventManager.h @@ -112,7 +112,8 @@ class EventManager : Timer::Hook { size_t pending_generation_ = 0; std::queue> prev_rcu_tasks_; std::queue> curr_rcu_tasks_; - + unsigned long flags; + struct RemoteData : CacheAligned { ebbrt::SpinLock lock; std::list> tasks; diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 39ab6f4f..f5c25de6 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -66,181 +66,135 @@ const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { return mac_addr_; } -void ebbrt::IxgbeDriver::DumpStats() { - bool printout = false; - uint64_t tins, tcycs, tllc, numr, nums, numtxbytes, numrxbytes; - double ttime, tnrg, twatts; - - tins = tcycs = tllc = numr = nums = numtxbytes = numrxbytes = 0; - ttime = tnrg = twatts = 0.0; - - for(size_t i = 0; i < Cpu::Count(); i++) { - if(ixgmq[i]->stat_init == false) { - ixgmq[i]->stat_init = true; - } else { - ixgmq[i]->stat_init = false; - printout = true; - - tcycs += ixgmq[i]->totalCycles; - tins += ixgmq[i]->totalIns; - 
tllc += ixgmq[i]->totalLLCmisses; +std::string ebbrt::IxgbeDriver::ReadNic() { + uint32_t i = static_cast(Cpu::GetMine()); + return ixgmq[i]->str_stats.str(); +} - numr += ixgmq[i]->stat_num_recv; - nums += ixgmq[i]->stat_num_send; - numrxbytes += ixgmq[i]->stat_num_rx_bytes; - numtxbytes += ixgmq[i]->stat_num_tx_bytes; - - //ebbrt::kprintf("DumpStats() Core %u \t cycles:%llu \n", i, ixgmq[i]->totalCycles); - //ebbrt::kprintf("\t instructions:%llu\n", ixgmq[i]->totalIns); - //ebbrt::kprintf("\t llc_misses:%llu\n", ixgmq[i]->totalLLCmisses); - //ebbrt::kprintf("\t num_recv:%lld num_send:%lld num_rx_bytes:%lld num_tx_bytes=%lld\n", i, ixgmq[i]->stat_num_recv, ixgmq[i]->stat_num_send, ixgmq[i]->stat_num_rx_bytes, ixgmq[i]->stat_num_tx_bytes); - - if(i == 0) { - ttime = ixgmq[i]->totalTime; - tnrg = ixgmq[i]->totalNrg; - twatts = tnrg / ttime; - - //ebbrt::kprintf("\t Total Time (s): %.2llf\n", ixgmq[i]->totalTime); - //ebbrt::kprintf("\t Total Energy (j): %.2llf\n", ixgmq[i]->totalNrg); - //ebbrt::kprintf("\t Power (Watts): %.2llf\n", ixgmq[i]->totalNrg/ixgmq[i]->totalTime); - } - - ixgmq[i]->perfCycles.Stop(); - ixgmq[i]->perfInst.Stop(); - ixgmq[i]->perfLLC_miss.Stop(); - - ixgmq[i]->stat_num_recv = 0; - ixgmq[i]->stat_num_send = 0; - ixgmq[i]->stat_num_rx_bytes = 0; - ixgmq[i]->stat_num_tx_bytes = 0; - - // accumulate counters - /*ixgmq[i]->totalCycles += static_cast(ixgmq[i]->perfCycles.Read()); - ixgmq[i]->totalIns += static_cast(ixgmq[i]->perfInst.Read()); - ixgmq[i]->totalLLCmisses += static_cast(ixgmq[i]->perfLLC_miss.Read()); - - ebbrt::kprintf("DumpStats() Core %u \t cycles:%llu \n", i, ixgmq[i]->totalCycles); - ebbrt::kprintf("\t instructions:%llu\n", ixgmq[i]->totalIns); - ebbrt::kprintf("\t llc_misses:%llu\n", ixgmq[i]->totalLLCmisses); - ebbrt::kprintf("\t num_recv:%lld num_send:%lld num_rx_bytes:%lld num_tx_bytes=%lld\n", i, ixgmq[i]->stat_num_recv, ixgmq[i]->stat_num_send, ixgmq[i]->stat_num_rx_bytes, ixgmq[i]->stat_num_tx_bytes); - - // clear - 
ixgmq[i]->perfCycles.Clear(); - ixgmq[i]->perfInst.Clear(); - ixgmq[i]->perfLLC_miss.Clear();*/ +void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { + uint32_t i = static_cast(Cpu::GetMine()); + if(s == "rx_usecs") { + ixgmq[i]->itr_val = v; + ebbrt::kprintf_force("rx-usecs = %u\n", ixgmq[i]->itr_val*2); + WriteEitr(i, (ixgmq[i]->itr_val << 3) | IXGBE_EITR_CNT_WDIS); + } else if(s == "rapl") { + if(i == 0 || i == 1) { + ixgmq[i]->powerMeter.SetLimit(v); } - } - - if(printout) { - ebbrt::kprintf("\t cycles:%llu\n", tcycs); - ebbrt::kprintf("\t instructions:%llu\n", tins); - ebbrt::kprintf("\t IPC:%.2llf\n", (double)tins/tcycs); - ebbrt::kprintf("\t llc_misses:%llu\n", tllc); - ebbrt::kprintf("\t num_recv:%llu\n", numr); - ebbrt::kprintf("\t num_send:%llu\n", nums); - ebbrt::kprintf("\t num_rx_bytes:%llu\n", numrxbytes); - ebbrt::kprintf("\t num_tx_bytes:%lld\n", numtxbytes); - ebbrt::kprintf("\t total_time:%.2llf\n", ttime); - ebbrt::kprintf("\t total_energy:%.2llf\n", tnrg); - ebbrt::kprintf("\t power:%.2llf\n", twatts); - } - - /*uint32_t i = static_cast(Cpu::GetMine()); - - if(ixgmq[i]->stat_perf == false) { - ixgmq[i]->perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + } else if(s == "start_perf") { + ixgmq[i]->stat_num_recv = 0; + ixgmq[i]->time_us = 0; + ixgmq[i]->totalCycles = 0; + ixgmq[i]->totalIns = 0; + ixgmq[i]->totalLLCmisses = 0; + ixgmq[i]->totalNrg = 0; + ixgmq[i]->perfCycles.Start(); - ixgmq[i]->perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); ixgmq[i]->perfInst.Start(); - ixgmq[i]->perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); ixgmq[i]->perfLLC_miss.Start(); - - if(i == 0) { - ixgmq[i]->powerMeter = ebbrt::rapl::RaplCounter(); - ixgmq[i]->powerMeter.Start(); + if(i == 0 || i == 1) { auto d = ebbrt::clock::Wall::Now().time_since_epoch(); ixgmq[i]->time_us = std::chrono::duration_cast(d).count(); + ixgmq[i]->powerMeter.Start(); } - //ebbrt::kprintf("\t Start Time 
(us): %llu\n", ixgmq[i]->time_us); - - ixgmq[i]->stat_perf =true; - } - else { + ebb_->StartTimer(); + + } else if(s == "stop_perf") { ixgmq[i]->perfCycles.Stop(); ixgmq[i]->perfInst.Stop(); ixgmq[i]->perfLLC_miss.Stop(); - - uint64_t cyc = static_cast(ixgmq[i]->perfCycles.Read()); - uint64_t inst = static_cast(ixgmq[i]->perfInst.Read()); - uint64_t llc = static_cast(ixgmq[i]->perfLLC_miss.Read()); - - ebbrt::kprintf("Core %u STATS: num_recv:%lld num_send:%lld num_rx_bytes:%lld num_tx_bytes=%lld\n", i, ixgmq[i]->stat_num_recv, ixgmq[i]->stat_num_send, ixgmq[i]->stat_num_rx_bytes, ixgmq[i]->stat_num_tx_bytes); - ebbrt::kprintf("\t cycles:%llu \n", cyc); - ebbrt::kprintf("\t instructions:%llu\n", inst); - ebbrt::kprintf("\t llc_misses:%llu\n", llc); - ebbrt::kprintf("\t ipc: %.2llf\n", (double)inst/cyc); - - if(i == 0) { - ixgmq[i]->powerMeter = ebbrt::rapl::RaplCounter(); + if(i == 0 || i == 1) { ixgmq[i]->powerMeter.Stop(); - double energyj = ixgmq[i]->powerMeter.Read(); - ebbrt::kprintf("\t Energy (j): %.2llf\n", energyj); + } + // accumulate counters + ixgmq[i]->totalCycles += static_cast(ixgmq[i]->perfCycles.Read()); + ixgmq[i]->totalIns += static_cast(ixgmq[i]->perfInst.Read()); + ixgmq[i]->totalLLCmisses += static_cast(ixgmq[i]->perfLLC_miss.Read()); + ixgmq[i]->totalInterrupts = ixgmq[i]->stat_num_recv; + if(i == 0 || i == 1) { + ixgmq[i]->totalNrg += ixgmq[i]->powerMeter.Read(); auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - uint64_t endt = std::chrono::duration_cast(d).count(); - double totaltime = (double)(endt - (ixgmq[i]->time_us)) / 1000000.0; - - //ebbrt::kprintf("\t End (us): %llu\n", endt); - ebbrt::kprintf("\t TotalTime (s): %.2llf\n", totaltime); - ebbrt::kprintf("\t Power (Watts): %.2llf\n", energyj/totaltime); + auto endt = std::chrono::duration_cast(d).count(); + ixgmq[i]->totalTime = ((double)(endt - (ixgmq[i]->time_us)) / 1000000.0); + //ixgmq[i]->totalPower = ixgmq[i]->totalNrg / ixgmq[i]->totalTime; + //ebbrt::kprintf_force("Core 
%u: cycles=%llu ins=%llu llc=%llu energy=%.2lfJ totalTime=%.2f secs Power (Watts): %.2lf\n", i, ixgmq[i]->totalCycles, ixgmq[i]->totalIns, ixgmq[i]->totalLLCmisses, ixgmq[i]->totalNrg, totalTime, ); } - ixgmq[i]->stat_perf = false; - }*/ -} - -void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { - uint32_t i = static_cast(Cpu::GetMine()); - if(s == "rx_usecs") { - ebbrt::kprintf_force("rx-usecs = %u\n", v); - WriteEitr(i, (v << 3) | IXGBE_EITR_CNT_WDIS); + ixgmq[i]->perfCycles.Clear(); + ixgmq[i]->perfInst.Clear(); + ixgmq[i]->perfLLC_miss.Clear(); + ixgmq[i]->stat_num_recv = 0; - } else if(s == "start_perf") { - ebb_->StartTimer(); + ebb_->StopTimer(); - } else if(s == "stop_perf") { - ebb_->StopTimer(); - } else if(s == "print") { - ebbrt::kprintf_force("num_recvs=%u totalt(us) = %u\n", ixgmq[i]->stat_num_recv, ixgmq[i]->ttotalt); + uint64_t cycs, ins, llc, nints; + double ttime, tnrg; + ttime = tnrg = 0.0; + cycs = ins = llc = nints = 0; + + for(uint32_t i = 0; i < static_cast(Cpu::Count()); i++) { + cycs += ixgmq[i]->totalCycles; + ins += ixgmq[i]->totalIns; + llc += ixgmq[i]->totalLLCmisses; + tnrg += ixgmq[i]->totalNrg; + nints += ixgmq[i]->totalInterrupts; + } + ttime = ixgmq[0]->totalTime > ixgmq[1]->totalTime ? 
ixgmq[0]->totalTime : ixgmq[1]->totalTime; + + ixgmq[i]->str_stats.str(""); + ixgmq[i]->str_stats.precision(20); + ixgmq[i]->str_stats << "INSTRUCTIONS=" << ins + << " CYCLES=" << cycs + << " IPC=" << (float)ins/cycs + << " LLC_MISSES=" << llc + << " TIME=" << ttime + << " WATTS=" << tnrg/ttime + << " AVG_ITR_PER_CORE=" << (float)nints/static_cast(Cpu::Count()) + << " ITR=" << ixgmq[i]->itr_val * 2 + << " RAPL=" << ixgmq[i]->rapl_val; + - } else if(s == "clear") { - ixgmq[i]->stat_num_recv = 0; - ixgmq[i]->time_us = 0; - ixgmq[i]->ttotalt = 0; + ebbrt::kprintf_force("INSTRUCTIONS=%llu\n", ins); + ebbrt::kprintf_force("CYCLES=%llu\n", cycs); + ebbrt::kprintf_force("IPC=%.2f\n", (float)ins/cycs); + ebbrt::kprintf_force("LLC_MISSES=%llu\n", llc); + ebbrt::kprintf_force("TIME=%.2f\n", ttime); + ebbrt::kprintf_force("WATTS=%.2f\n", tnrg/ttime); + ebbrt::kprintf_force("AVG_ITR_PER_CORE=%.2f\n", (float)nints/static_cast(Cpu::Count())); + ebbrt::kprintf_force("\n"); + + //ebbrt::kprintf_force("nrg=%.2lf J\n", tnrg); + //ebbrt::kprintf_force("ttime=%.2f s time1=%.2f s time2=%.2f s\n", ttime, ixgmq[0]->totalTime, ixgmq[1]->totalTime); - } - else { + } else if(s == "start_idle") { + ixgmq[i]->time_send = 0; + ixgmq[i]->time_idle_min = 999999; + ixgmq[i]->time_idle_max = 0; + ixgmq[i]->total_idle_time = 0; + ixgmq[i]->stat_num_recv = 0; + ixgmq[i]->idle_times_.clear(); + } else if(s == "stop_idle") { + ebbrt::kprintf_force("Core %u: idle_min=%llu idle_max=%llu stat_num_recv=%llu avg_idle=%.2lf\n", i, ixgmq[i]->time_idle_min, ixgmq[i]->time_idle_max, ixgmq[i]->stat_num_recv, (double)ixgmq[i]->total_idle_time/ixgmq[i]->stat_num_recv); + /*for(const auto& n : ixgmq[i]->idle_times_) { + ebbrt::kprintf_force("%u: %u\n", n.first, n.second); + }*/ + } else { ebbrt::kprintf_force("%s Unknown command: %s\n", __PRETTY_FUNCTION__, s); + } } void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { -/*#ifdef STATS_EN - if(pinfo.get_stats) { - DumpStats(); - } - #endif*/ 
ebb_->Send(std::move(buf), std::move(pinfo)); } -//void ebbrt::IxgbeDriver::Run() { ebb_->StartTimer(); } - // After packet transmission, need to mark bit in // tx queue so that it can be used again // TX_HEAD_WB does it automatically void ebbrt::IxgbeDriverRep::ReclaimTx() { -#ifndef TX_HEAD_WB +/*#ifndef TX_HEAD_WB size_t head = ixgmq_.tx_head_; size_t tail = ixgmq_.tx_tail_; tdesc_advance_tx_wbf_t* actx; @@ -250,8 +204,9 @@ void ebbrt::IxgbeDriverRep::ReclaimTx() { actx = reinterpret_cast(&(ixgmq_.tx_ring_[head])); // if context - if (ixgmq_.tx_isctx_[head]) { + if (ixgmq_.tx_isctx[head]) { head = (head + 1) % ixgmq_.tx_size_; + ixgmq_.tx_isctx[head] = false; } // if non eop else if (!(actx->dd)) { @@ -263,106 +218,6 @@ void ebbrt::IxgbeDriverRep::ReclaimTx() { ixgmq_.tx_head_ = head; } } -#endif -} - -// every TX requires a context struct before -void ebbrt::IxgbeDriverRep::AddContext(uint8_t idx, uint8_t maclen, - uint16_t iplen, uint8_t l4len, - enum l4_type l4type) { - - tdesc_advance_ctxt_wb_t* actx; - - auto tail = ixgmq_.tx_tail_; - - // context buffer already allocated, need to zero - actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail])); - - actx->raw_1 = 0x0; - actx->raw_2 = 0x0; - - memset(actx, 0, sizeof(tdesc_advance_ctxt_wb_t)); - ixgmq_.tx_isctx_[tail] = true; - - // refer to 82599 datasheet for these settings - actx->iplen = iplen; - actx->maclen = maclen; - - actx->dtyp = 0b0010; - actx->dext = 1; - actx->idx = idx; - - actx->ipv4 = 1; - //actx->l4len = 0; // ignored when TSE not set - actx->l4len = 0x14; // TSE for TCP is 20 - actx->l4t = l4type; - - actx->mss = 0x5b4; // MSS - 1460 - //actx->mss = 0x5a8; // MSS - 1448 - - // need to increment tail - ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; -} - -// Add a new packet to be transmitted -void ebbrt::IxgbeDriverRep::AddTx(uint64_t pa, uint64_t len, - uint64_t totallen, bool first, bool last, - uint8_t ctx, bool ip_cksum, - bool tcpudp_cksum, bool tse, 
int hdr_len) { - tdesc_advance_tx_rf_t* actx; - - auto tail = ixgmq_.tx_tail_; - actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail])); - - ixgmq_.tx_isctx_[tail] = false; - - actx->raw[0] = 0x0; - actx->raw[1] = 0x0; - - // pa is physical address of where send buffer exists - actx->address = reinterpret_cast(pa); - - actx->dtalen = len; - if (first) { - if(tse) { - actx->paylen = totallen - hdr_len; - } else { - actx->paylen = totallen; - } - - // checksum - actx->ifcs = 1; - - // tcp segmentation offload - if(tse) { - actx->tse = 1; - } - } - - // type is advanced - actx->dtyp = 0b0011; - actx->dext = 1; - - // set last packet bit - if (last) { - actx->eop = 1; - // rs bit should only be set when eop is set - actx->rs = 1; - } else { - actx->eop = 0; - } - - if (ctx != -1 && first) { - actx->idx = ctx; - actx->ixsm = ip_cksum; // ip checksum offload - actx->txsm = tcpudp_cksum; // udp or tcp checksum offload - } - - ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - ixgmq_.tx_tail_ = (tail + 1) % ixgmq_.tx_size_; -/*#ifdef STATS_EN - ixgmq_.stat_num_tx ++; #endif*/ } @@ -371,9 +226,9 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { std::unique_ptr b; tdesc_advance_tx_rf_t* arfx; tdesc_advance_ctxt_wb_t* actx; + tdesc_advance_tx_wbf_t* awbfx; uint32_t mcore = static_cast(Cpu::GetMine()); - //uint32_t free_desc = 0; - //int i; + uint32_t end, free_desc; // On TSO, the maximum PAYLEN can be up to 2^18 - 1 len = buf->ComputeChainDataLength(); @@ -382,61 +237,7 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { return; } -/*#ifdef STATS_EN - // counter initialization, only need to do once - if(ixgmq_.stat_init == true && ixgmq_.stat_start == false) { - ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); - ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); - ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); - ixgmq_.perfCycles.Start(); - 
ixgmq_.perfInst.Start(); - ixgmq_.perfLLC_miss.Start(); - - if(mcore == 0) { - ixgmq_.powerMeter = ebbrt::rapl::RaplCounter(); - ixgmq_.powerMeter.Start(); - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - ixgmq_.time_us = std::chrono::duration_cast(d).count(); - } - - ixgmq_.stat_start = true; - - // every 10000 sends - } else if (ixgmq_.stat_init == true && ixgmq_.stat_start == true && ixgmq_.stat_num_send % 10000 == 0) { - //stop counters - ixgmq_.perfCycles.Stop(); - ixgmq_.perfInst.Stop(); - ixgmq_.perfLLC_miss.Stop(); - ixgmq_.powerMeter.Stop(); - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - uint64_t endt = std::chrono::duration_cast(d).count(); - - // accumulate counters - ixgmq_.totalCycles += static_cast(ixgmq_.perfCycles.Read()); - ixgmq_.totalIns += static_cast(ixgmq_.perfInst.Read()); - ixgmq_.totalLLCmisses += static_cast(ixgmq_.perfLLC_miss.Read()); - ixgmq_.totalTime += ((double)(endt - (ixgmq_.time_us)) / 1000000.0); - ixgmq_.totalNrg += ixgmq_.powerMeter.Read(); - - // clear - ixgmq_.perfCycles.Clear(); - ixgmq_.perfInst.Clear(); - ixgmq_.perfLLC_miss.Clear(); - - // restart again - ixgmq_.perfCycles.Start(); - ixgmq_.perfInst.Start(); - ixgmq_.perfLLC_miss.Start(); - ixgmq_.powerMeter.Start(); - auto dd = ebbrt::clock::Wall::Now().time_since_epoch(); - ixgmq_.time_us = std::chrono::duration_cast(dd).count(); - } - - ixgmq_.stat_num_send ++; - ixgmq_.stat_num_tx_bytes += len; - #endif*/ - - /*if(ixgmq_.tx_tail_ > ixgmq_.tx_head_) { + if(ixgmq_.tx_tail_ > ixgmq_.tx_head_) { free_desc = IxgbeDriver::NTXDESCS - (ixgmq_.tx_tail_ - ixgmq_.tx_head_); } else if(ixgmq_.tx_tail_ < ixgmq_.tx_head_){ free_desc = IxgbeDriver::NTXDESCS - ((ixgmq_.tx_tail_+IxgbeDriver::NTXDESCS) - ixgmq_.tx_head_); @@ -444,30 +245,23 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { free_desc = IxgbeDriver::NTXDESCS; } - // 40 descriptors is theoretical limit of how many descriptors can be used at once - if(free_desc < 60) { - - // 
from first sent descriptor - for (auto rit = ixgmq_.send_to_watch.begin(); rit != ixgmq_.send_to_watch.end(); ++rit) { - arfx = reinterpret_cast(&(ixgmq_.tx_ring_[*rit])); - - // Force memory writes to complete before letting h/w know there - // are new descriptors to fetch. (Only applicable for weak-ordered - // memory model archs, such as IA-64). - asm volatile("sfence" ::: "memory"); - - // wait until its sent - while(arfx->dd == 0) { - // makes sure all reads are finished before checking again - asm volatile("lfence":::"memory"); + // (IxgbeDriver::NTXDESCS - 1): 340 W, 1599820.2, eax=0x60 + if(free_desc < (IxgbeDriver::NTXDESCS - 1)) { + auto head = ixgmq_.tx_head_; + auto tail = ixgmq_.tx_tail_; + + while(head != tail) { + if(ixgmq_.tx_iseop[head] == true) { + awbfx = reinterpret_cast(&(ixgmq_.tx_ring_[head])); + while(awbfx->dd == 0) { + asm volatile("lfence":::"memory"); + } + ixgmq_.tx_iseop[head] = false; } - - // increment head ptr - ixgmq_.tx_head_ = (*rit + 1) % ixgmq_.tx_size_; - //ebbrt::kprintf("\t core=%u Reclaimed *rit=%u head=%u\n", mcore, *rit, ixgmq_.tx_head_); + head = (head + 1) % ixgmq_.tx_size_; } - ixgmq_.send_to_watch.clear(); - }*/ + ixgmq_.tx_head_ = head; + } if(buf->IsChained()) { b = MakeUniqueIOBuf(len); @@ -507,12 +301,20 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { arfx->dtyp = 0x3; arfx->dext = 1; + //ebbrt::kprintf("Send len=%d\n", len); //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); + end = static_cast(ixgmq_.tx_tail_); + ixgmq_.tx_iseop[end] = true; + //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } else { + if(len > IXGBE_MAX_DATA_PER_TXD) { + //start = ixgmq_.tx_tail_; 
+ /*** CONTEXT START ***/ actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); actx->raw_1 = 0x0; actx->raw_2 = 0x0; @@ -539,8 +341,9 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { actx->l4len = pinfo.tcp_hdr_len; //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ - //first descriptor + //first descriptor arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); arfx->raw[0] = 0x0; arfx->raw[1] = 0x0; @@ -590,11 +393,17 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { //ebbrt::kprintf("Send() last descriptor mcore=%u tail=%u dtalen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, tsolen, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; } } } else if(len > 1490 && len < IXGBE_MAX_DATA_PER_TXD) { + //start = ixgmq_.tx_tail_; + + /*** CONTEXT START ***/ actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); actx->raw_1 = 0x0; actx->raw_2 = 0x0; @@ -619,9 +428,10 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { actx->mss = 1448; // TCP header length, with no tcp options == 20, ignored when no TSE actx->l4len = pinfo.tcp_hdr_len; - //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 
0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - + /*** CONTEXT END ***/ + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); arfx->raw[0] = 0x0; arfx->raw[1] = 0x0; @@ -642,10 +452,14 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size arfx->paylen = pinfo.tcp_len; //ebbrt::kprintf("Send mcore=%u tail=%u dtalen=%u paylen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, len, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); - //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; } else { + + //start = ixgmq_.tx_tail_; + /*** CONTEXT START ***/ actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); actx->raw_1 = 0x0; actx->raw_2 = 0x0; @@ -670,8 +484,9 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { actx->mss = 0; // TCP header length, with no tcp options == 20, ignored when no TSE actx->l4len = 0; - //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); arfx->raw[0] = 0x0; @@ -700,6 
+515,9 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; } } @@ -711,243 +529,15 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { WriteTdt_1(mcore, ixgmq_.tx_tail_); - while(arfx->dd == 0) { + /*while(arfx->dd == 0) { // makes sure all reads are finished before checking again asm volatile("lfence":::"memory"); - } - - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - ixgmq_.time_us = std::chrono::duration_cast(d).count(); + }*/ - //rtdh = ReadTdh_1(mcore); - //rtdt = ReadTdt_1(mcore); - //ebbrt::kprintf("\t Send() core=%u After len=%d rtdh=%u %rtdt=%u tail=%u\n\n", mcore, len, rtdh, rtdt, ixgmq_.tx_tail_); + //auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + //ixgmq_.time_send = std::chrono::duration_cast(d).count(); } -/*void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { - bool ip_cksum = false; - bool tcpudp_cksum = false; - uint64_t data; - size_t len, count; - uint32_t mcore = static_cast(Cpu::GetMine()); - -#ifdef STATS_EN - ixgmq_.stat_num_send ++; -#endif - -// TODO threshold for triggering reclaim tx buffers -#ifndef TX_HEAD_WB - size_t free_desc = - IxgbeDriver::NTXDESCS - - (std::abs(static_cast(ixgmq_.tx_tail_ - ixgmq_.tx_head_))); - count = buf->CountChainElements(); - // free descripts must have enough for count in chained iobufs - if (free_desc < (count + 1)) { - // reclaim buffers - ReclaimTx(); - - free_desc = IxgbeDriver::NTXDESCS - - (std::abs(static_cast(ixgmq_.tx_tail_ - 
ixgmq_.tx_head_))); - // not enough descriptors got freed - if (free_desc < (count + 1)) { - ebbrt::kprintf("Not enough descriptors got freed\n"); - return; - } - } -#endif - -if (pinfo.flags & PacketInfo::kNeedsIpCsum) { - ip_cksum = true; - } - - if (pinfo.flags & PacketInfo::kNeedsCsum) { - tcpudp_cksum = true; - } - - // buffers are chained - if(buf->IsChained()) { - len = buf->ComputeChainDataLength(); - count = buf->CountChainElements(); - - if(tcpudp_cksum) { - if (pinfo.csum_offset == 6) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); - } else if (pinfo.csum_offset == 16) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); - } else { - ebbrt::kprintf("%s unknown packet type checksum\n", __FUNCTION__); - ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); - } - } - - // 7.2.1.1 - // A packet (or multiple packets in transmit segmentation) can span - // any number of buffers (and their descriptors) up to a limit of 40 minus WTHRESH minus 2 - if(count > 38) { - //ebbrt::kprintf_force("count = %d\n", count); - std::unique_ptr b; - b = MakeUniqueIOBuf(len); - auto mdata = b->MutData(); - for (auto& buf_it : *buf) { - memcpy(mdata, buf_it.Data(), buf_it.Length()); - mdata += buf_it.Length(); - } - data = reinterpret_cast(b->MutData()); - AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); - - } else { - size_t counter = 0; - for (auto& buf_it : *buf) { - counter++; - - uint64_t dlen = reinterpret_cast(buf_it.Length()); - uint64_t daddr = reinterpret_cast(buf_it.Data()); - - // first buffer - if (counter == 1) { - AddTx(daddr, dlen, len, true, false, 0, ip_cksum, tcpudp_cksum, - len > 1514, static_cast(pinfo.hdr_len)); - //last buffer - } else if (counter == count ) { - AddTx(daddr, dlen, len, false, true, 0, ip_cksum, tcpudp_cksum, - len > 1514, static_cast(pinfo.hdr_len)); - } else { - AddTx(daddr, dlen, len, false, false, 0, ip_cksum, tcpudp_cksum, - len > 1514, 
static_cast(pinfo.hdr_len)); - } - } - } - } else { // buffers NOT chained - data = reinterpret_cast(buf->Data()); - len = buf->ComputeChainDataLength(); - if(tcpudp_cksum) { - // check datasheet for numbers - if (pinfo.csum_offset == 6) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); - } else if (pinfo.csum_offset == 16) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); - } else { - ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); - } - - AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); - } else { - AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); - } - - // dump eth packet info - auto rtdh = ReadTdh_1(mcore); - auto rtdt = ReadTdt_1(mcore); - ebbrt::kprintf("\t Send() Before len=%d rtdh=%u %rtdt=%u tx_tail_=%u\n", len, rtdh, rtdt, (uint32_t)(ixgmq_.tx_tail_)); - auto p1 = reinterpret_cast(data); - for (int i = 0; i < (int)len; i+=8) { - if (i+8 < (int)len) { - ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - } - else{ - ebbrt::kprintf("%02X\n", p1[i]); - } - } - ebbrt::kprintf("\n"); - } - - // bump tx_tail - // indicates position beyond last descriptor hw - uint32_t tail = (uint32_t)(ixgmq_.tx_tail_); - asm volatile("sfence" ::: "memory"); - - WriteTdt_1(mcore, tail); - - tdesc_advance_tx_rf_t* actx; - actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail-1])); - - while(actx->dd == 0) { - ebbrt::clock::SleepMilli(1); - } - auto rtdh = ReadTdh_1(mcore); - auto rtdt = ReadTdt_1(mcore); - ebbrt::kprintf("\t Send() After len=%d rtdh=%u %rtdt=%u tail=%u\n\n", len, rtdh, rtdt, tail); - }*/ -/*void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { - bool ip_cksum = false; - bool tcpudp_cksum = false; - uint64_t data; - size_t len; - uint32_t mcore = static_cast(Cpu::GetMine()); - std::unique_ptr b; - - if (pinfo.flags & 
PacketInfo::kNeedsIpCsum) { - ip_cksum = true; - } - - if (pinfo.flags & PacketInfo::kNeedsCsum) { - tcpudp_cksum = true; - } - - len = buf->ComputeChainDataLength(); - - // buffers are chained - if(buf->IsChained()) { - b = MakeUniqueIOBuf(len); - auto mdata = b->MutData(); - for (auto& buf_it : *buf) { - memcpy(mdata, buf_it.Data(), buf_it.Length()); - mdata += buf_it.Length(); - } - data = reinterpret_cast(b->MutData()); - } else { // buffers NOT chained - data = reinterpret_cast(buf->Data()); - } - - if(tcpudp_cksum) { - // check datasheet for numbers - if (pinfo.csum_offset == 6) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_udp); - } else if (pinfo.csum_offset == 16) { - AddContext(0, ETHHDR_LEN, IPHDR_LEN, 0, l4_type_tcp); - } else { - ebbrt::kabort("%s unknown packet type checksum\n", __FUNCTION__); - } - - AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); - } else { - AddTx(data, len, len, true, true, 0, ip_cksum, tcpudp_cksum, len > 1514, static_cast(pinfo.hdr_len)); - } - - // dump eth packet info - auto rtdh = ReadTdh_1(mcore); - auto rtdt = ReadTdt_1(mcore); - ebbrt::kprintf("\t Core=%u Send() Before len=%d rtdh=%u %rtdt=%u tx_tail_=%u\n", mcore, len, rtdh, rtdt, (uint32_t)(ixgmq_.tx_tail_)); - auto p1 = reinterpret_cast(data); - for (int i = 0; i < (int)len; i+=8) { - if (i+8 < (int)len) { - ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X \n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - } - else{ - ebbrt::kprintf("%02X \n", p1[i]); - } - } - ebbrt::kprintf("\n"); - - // bump tx_tail - // indicates position beyond last descriptor hw - uint32_t tail = (uint32_t)(ixgmq_.tx_tail_); - asm volatile("sfence" ::: "memory"); - - WriteTdt_1(mcore, tail); - - tdesc_advance_tx_rf_t* actx; - actx = reinterpret_cast(&(ixgmq_.tx_ring_[tail-1])); - - while(actx->dd == 0) { - ebbrt::clock::SleepMilli(1); - } - rtdh = ReadTdh_1(mcore); - rtdt = ReadTdt_1(mcore); - ebbrt::kprintf("\t 
Send() After len=%d rtdh=%u %rtdt=%u tail=%u\n\n", len, rtdh, rtdt, tail); - } -*/ - void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { // Disable RXCTRL - 8.2.3.8.10 bar0_.Write32(0x03000, m); @@ -2357,8 +1947,8 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // setup RX interrupts for queue i dev_.SetMsixEntry(i, rcv_vector, ebbrt::Cpu::GetByIndex(i)->apic_id()); - - ebbrt::kprintf("Core %d: BSIZEPACKET=%d bytes NTXDESCS=%d NRXDESCS=%d ITR_INTERVAL=%dus RCV_VECTOR=%d APIC_ID=%d \n", i, RXBUFSZ, NTXDESCS, NRXDESCS, (int) (IxgbeDriver::ITR_INTERVAL * 2), (int)rcv_vector, (int)(ebbrt::Cpu::GetByIndex(i)->apic_id())); + + //ebbrt::kprintf("Core %d: BSIZEPACKET=%d bytes NTXDESCS=%d NRXDESCS=%d ITR_INTERVAL=%dus RCV_VECTOR=%d APIC_ID=%d \n", i, RXBUFSZ, NTXDESCS, NRXDESCS, (int) (IxgbeDriver::ITR_INTERVAL * 2), (int)rcv_vector, (int)(ebbrt::Cpu::GetByIndex(i)->apic_id())); // don't set up interrupts for tx since we have head writeback?? auto qn = i / 2; // put into correct IVAR @@ -2473,10 +2063,9 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteTdh(i, 0x0); - // transmit queue enable - //WriteTxdctl(i, 0x1 << 25); - //WriteTxdctl(i, 0x2010120); - WriteTxdctl(i, 0x2000000); + // transmit queue enable - PTHRESH=32 HTHRESH=1 WTHRESH=1 + WriteTxdctl(i, 0x2010120); + //WriteTxdctl(i, 0x2000000); // poll until set, TODO: Timeout while (ReadTxdctl_enable(i) == 0) { @@ -2719,19 +2308,23 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.stat_num_recv ++; #endif - if(ixgmq_.time_us == 0) { - //auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - //ixgmq_.time_us = std::chrono::duration_cast(d).count(); - } else { + /*if(ixgmq_.time_send > 0) { auto d = ebbrt::clock::Wall::Now().time_since_epoch(); uint64_t endt = std::chrono::duration_cast(d).count(); - ixgmq_.ttotalt += (endt - ixgmq_.time_us); - - //ebbrt::kprintf("Core %u: time elapsed (us): %llu\n", mcore, endt - ixgmq_.time_us); - - //auto dd = ebbrt::clock::Wall::Now().time_since_epoch(); - 
//ixgmq_.time_us = std::chrono::duration_cast(dd).count(); - } + uint64_t idlet = endt - ixgmq_.time_send; + uint64_t idlet_mod = (idlet / 50) * 50; + + auto got = ixgmq_.idle_times_.find(idlet_mod); + // not found + if(got == ixgmq_.idle_times_.end()) + ixgmq_.idle_times_[idlet_mod] = 1; + else + ixgmq_.idle_times_[idlet_mod] ++; + + ixgmq_.time_idle_min = idlet < ixgmq_.time_idle_min ? idlet : ixgmq_.time_idle_min; + ixgmq_.time_idle_max = idlet > ixgmq_.time_idle_max ? idlet : ixgmq_.time_idle_max; + ixgmq_.total_idle_time += idlet; + }*/ // while there are still packets received while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 1) { @@ -2873,9 +2466,11 @@ ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) ixgmq_(root.GetMultiQueue(Cpu::GetMine())), receive_callback_([this]() { ReceivePoll(); }) { //this->ReceivePoll(); - /*auto timeout = - std::chrono::seconds(1); - timer->Start(*this, timeout,true);*/ + ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); + ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); + + ixgmq_.powerMeter = ebbrt::rapl::RaplCounter(); } void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StartTimer() { @@ -2889,7 +2484,32 @@ void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StopTimer() { void ebbrt::IxgbeDriverRep::IxgbeDriverRep::Fire() { uint32_t mcore = static_cast(Cpu::GetMine()); - ebbrt::kprintf_force("Core %u: Fire()\n", mcore); + + ixgmq_.perfCycles.Stop(); + ixgmq_.perfInst.Stop(); + ixgmq_.perfLLC_miss.Stop(); + if(mcore == 0 || mcore == 1) { + ixgmq_.powerMeter.Stop(); + } + // accumulate counters + ixgmq_.totalCycles += static_cast(ixgmq_.perfCycles.Read()); + ixgmq_.totalIns += static_cast(ixgmq_.perfInst.Read()); + ixgmq_.totalLLCmisses += static_cast(ixgmq_.perfLLC_miss.Read()); + if(mcore == 0 || mcore == 1) { + ixgmq_.totalNrg += ixgmq_.powerMeter.Read(); + 
//ebbrt::kprintf_force("Core %u: Fire() cycles=%llu ins=%llu llc=%llu energy=%.2lfJ\n", mcore, ixgmq_.totalCycles, ixgmq_.totalIns, ixgmq_.totalLLCmisses, ixgmq_.totalNrg); + } + + ixgmq_.perfCycles.Clear(); + ixgmq_.perfInst.Clear(); + ixgmq_.perfLLC_miss.Clear(); + + ixgmq_.perfCycles.Start(); + ixgmq_.perfInst.Start(); + ixgmq_.perfLLC_miss.Start(); + if(mcore == 0 || mcore == 1) { + ixgmq_.powerMeter.Start(); + } } uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 6bcfcd97..ee4c8506 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -83,6 +83,7 @@ class IxgbeDriver : public EthernetDevice { //void Run(); void Send(std::unique_ptr buf, PacketInfo pinfo) override; void Config(std::string s, uint32_t v) override; + std::string ReadNic() override; const EthernetAddress& GetMacAddress() override; protected: @@ -105,8 +106,8 @@ class IxgbeDriver : public EthernetDevice { static const constexpr uint32_t NTXDESCS = 8192; static const constexpr uint32_t NRXDESCS = 8192; #else - static const constexpr uint32_t NTXDESCS = 64; - static const constexpr uint32_t NRXDESCS = 64; + static const constexpr uint32_t NTXDESCS = 512; + static const constexpr uint32_t NRXDESCS = 512; #endif // Linux Defaults @@ -116,7 +117,7 @@ class IxgbeDriver : public EthernetDevice { //static const constexpr uint32_t RXBUFSZ = 4096; //static const constexpr uint32_t RXBUFSZ = 16384; - static const constexpr uint8_t ITR_INTERVAL = 200; + static const constexpr uint8_t ITR_INTERVAL = 8; // 3 bits only (0 - 7) in (RSC_DELAY + 1) * 4 us static const constexpr uint8_t RSC_DELAY = 1; @@ -144,10 +145,19 @@ class IxgbeDriver : public EthernetDevice { // TODO: should be optimized rsc_chain_.reserve(NRXDESCS+1); + // keep a log of number of idle times + idle_times_.reserve(NRXDESCS); + + // keep track of context descriptors + tx_iseop.reserve(NRXDESCS); + for (uint32_t k = 0; k < NRXDESCS; k++) { + tx_iseop[k] 
= false; + } + // keeps a log of descriptors where eop == 1 // used to coalesce reclaiming of tx descriptors - // once the threshold of some limit is hit - send_to_watch.reserve(NRXDESCS); + // once the threshold of some limit is hit + //send_to_watch.reserve(NRXDESCS); // RX ring buffer allocation auto sz = align::Up(sizeof(rdesc_legacy_t) * NRXDESCS, 4096); @@ -170,14 +180,14 @@ class IxgbeDriver : public EthernetDevice { tx_ring_ = static_cast(addr); // TX adv context buffer allocation - sz = align::Up(sizeof(bool) * NTXDESCS, 4096); + /*sz = align::Up(sizeof(bool) * NTXDESCS, 4096); order = Fls(sz - 1) - pmem::kPageShift + 1; page = page_allocator->Alloc(order, nid); kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", __FUNCTION__); addr = reinterpret_cast(page.ToAddr()); memset(addr, 0, sz); - tx_isctx_ = static_cast(addr); + tx_isctx_ = static_cast(addr);*/ #ifdef TX_HEAD_WB // TODO: not sure how much exactly to allocate for head wb addr @@ -222,11 +232,17 @@ class IxgbeDriver : public EthernetDevice { std::vector> circ_buffer_; std::vector> rsc_chain_; - std::vector send_to_watch; + std::unordered_map idle_times_; + std::vector> send_to_watch; + std::vector tx_iseop; + std::ostringstream str_stats; + //std::vector send_to_watch; rdesc_legacy_t* rx_ring_; tdesc_legacy_t* tx_ring_; - bool* tx_isctx_; + + //std::vector tx_isctx; + //bool* tx_isctx_; bool rsc_used; int hanc; #ifdef TX_HEAD_WB @@ -241,12 +257,19 @@ class IxgbeDriver : public EthernetDevice { uint64_t stat_num_rx_bytes{0}; uint64_t stat_num_tx_bytes{0}; uint64_t time_us{0}; - uint64_t ttotalt{0}; + uint64_t time_send{0}; + uint64_t time_idle_min{999999}; + uint64_t time_idle_max{0}; + uint64_t total_idle_time{0}; + uint64_t totalInterrupts{0}; uint64_t totalCycles{0}; uint64_t totalIns{0}; uint64_t totalLLCmisses{0}; + uint32_t rapl_val{666}; + uint32_t itr_val{8}; double totalNrg{0.0}; double totalTime{0.0}; + double totalPower{0.0}; bool stat_start{false}; bool stat_init{false}; 
@@ -509,10 +532,10 @@ class IxgbeDriverRep : public MulticoreEbb, Timer:: void ReclaimTx(); void ReclaimRx(); void Send(std::unique_ptr buf, PacketInfo pinfo); - void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, - enum l4_type l4type); - void AddTx(uint64_t pa, uint64_t len, uint64_t totallen, bool first, - bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum, bool tse, int hdr_len); + //void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, + // enum l4_type l4type); + //void AddTx(uint64_t pa, uint64_t len, uint64_t totallen, bool first, + // bool last, uint8_t ctx, bool ip_cksum, bool tcpudp_cksum, bool tse, int hdr_len); void StartTimer(); void StopTimer(); diff --git a/src/native/Net.cc b/src/native/Net.cc index c3f63e4a..cb0afda9 100644 --- a/src/native/Net.cc +++ b/src/native/Net.cc @@ -64,6 +64,14 @@ void ebbrt::NetworkManager::Config(std::string s, uint32_t v) { interface_->Config(std::move(s), v); } +std::string ebbrt::NetworkManager::ReadNic() { + return interface_->ReadNic(); +} + void ebbrt::NetworkManager::Interface::Config(std::string s, uint32_t v) { ether_dev_.Config(std::move(s), v); } + +std::string ebbrt::NetworkManager::Interface::ReadNic() { + return ether_dev_.ReadNic(); +} diff --git a/src/native/Net.h b/src/native/Net.h index 4125f2f2..4dff7ae7 100644 --- a/src/native/Net.h +++ b/src/native/Net.h @@ -56,6 +56,7 @@ class EthernetDevice { virtual void Send(std::unique_ptr buf, PacketInfo pinfo = PacketInfo()) = 0; virtual void Config(std::string s, uint32_t v) = 0; + virtual std::string ReadNic() = 0; virtual const EthernetAddress& GetMacAddress() = 0; virtual ~EthernetDevice() {} }; @@ -248,6 +249,7 @@ class NetworkManager : public StaticSharedEbb { void SendIp(std::unique_ptr buf, Ipv4Address src, Ipv4Address dst, uint8_t proto, PacketInfo pinfo = PacketInfo()); void Config(std::string s, uint32_t v); + std::string ReadNic(); const EthernetAddress& MacAddress(); const ItfAddress* Address() 
const { return address_.get(); } @@ -315,6 +317,7 @@ class NetworkManager : public StaticSharedEbb { Interface& NewInterface(EthernetDevice& ether_dev); Ipv4Address IpAddress(); void Config(std::string s, uint32_t v); + std::string ReadNic(); private: Future StartDhcp(); diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index 11325f98..af6a64e8 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ -1144,98 +1144,15 @@ void ebbrt::NetworkManager::TcpEntry::SendSegment(TcpSegment& segment) { pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; pinfo.csum_offset = 16; // checksum is 16 bytes into the TCP header - - //auto local_ip = htonl(address.toU32()); - //auto remote_ip = htonl((std::get<0>(key)).toU32()); - - //ebbrt::kprintf("SendSegment() src_ip=0x%llX dst_ip=0x%llX kIpProtoTCP=0x%X buf_len=0x%X src_port=0x%X dst_port=0x%X seq_num=0x%X ack_num=0x%X htonl(ack_num)=0x%X\n", local_ip, remote_ip, kIpProtoTCP, segment.buf->ComputeChainDataLength(), segment.th.src_port, segment.th.dst_port, segment.th.seqno, segment.th.ackno, htonl(segment.th.ackno)); - //ebbrt::kprintf("\t flags=0x%X windows=0x%X urgent_pointer=0x%X sizeof(TcpHeader)=%d tcp_header.checksum=0x%X\n", segment.th.hdrlen_flags, segment.th.wnd, 0, sizeof(TcpHeader), segment.th.checksum); - - /*uint32_t i, tmp, count, sum, len; - uint16_t word16; - - tmp = count = sum = len = 0x0; - - if(segment.buf->IsChained()) { - for (auto& buf_it : *(segment.buf)) { - auto p1 = reinterpret_cast(buf_it.Data()); - len += buf_it.Length(); - //if(b == 0 && p1[0] == 0x81 && p1[1] == 0x0) { -// b = 1; -// pinfo.flags |= PacketInfo::kNeedsCsum; -// } - for (i = 0; i < buf_it.Length(); i++) { - if (count < 2) { - tmp = tmp | (p1[i] << ((1-count) * 8)); - count ++; - - if(count == 2) { - sum += tmp; - tmp = count = 0x0; - } - } - } - } - } else { - uint8_t* p1 = reinterpret_cast(segment.buf->MutData()); - len = segment.buf->ComputeChainDataLength(); - for (i = 0; i < len; i++) { - if (count < 2) { - 
tmp = tmp | (p1[i] << ((1-count) * 8)); - count ++; - - if(count == 2) { - sum += tmp; - tmp = count = 0x0; - } - } - } - } - - sum += tmp; - - // pseudo header start - //add src addr - word16 = (local_ip & 0xFFFF); - sum = sum + (uint32_t)word16; - word16 = ((local_ip >> 16) & 0xFFFF); - sum = sum + (uint32_t)word16; - - //add dst addr - word16 = (remote_ip & 0xFFFF); - sum = sum + (uint32_t)word16; - word16 = ((remote_ip >> 16) & 0xFFFF); - sum = sum + (uint32_t)word16; - - //add protocol number - sum = sum + kIpProtoTCP + len; - - while(sum >> 16) { - sum = (sum & 0xFFFF) + (sum >> 16); - } - sum = (~sum) & 0xFFFF; - segment.th.checksum = htons((uint16_t) sum); - // pseudo header end - */ - //OffloadPseudoCsumTso(kIpProtoTCP, address, std::get<0>(key)); - pinfo.tcp_hdr_len = segment.th.HdrLen(); pinfo.tcp_len = len - pinfo.tcp_hdr_len; - //ebbrt::kprintf("\nSendSegment() total_len=%u len=%u tcp_hdr_len=%u tcp_len=%u checksum=0x%X\n", totallen, len, pinfo.tcp_hdr_len, pinfo.tcp_len, segment.th.checksum); - /*if(b) { - ebbrt::kprintf("len=%u checksum=0x%X\n\n", len, segment.th.checksum); - }*/ - // XXX: Actually store the MSS instead of making this assumption size_t mss = 1460; if (segment.tcp_len > mss) { pinfo.gso_type = PacketInfo::kGsoTcpv4; pinfo.hdr_len = segment.th.HdrLen(); pinfo.gso_size = mss; -//#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ - -//#endif } network_manager->SendIp(CreateRefChain(*(segment.buf)), address, @@ -1249,7 +1166,7 @@ void ebbrt::NetworkManager::TcpReset(bool ack, uint32_t seqno, uint32_t ackno, uint16_t local_port, uint16_t remote_port) { - ebbrt::kabort("ebbrt::NetworkManager::TcpReset() - Aborting haven't added checksum offloading\n"); +// ebbrt::kabort("ebbrt::NetworkManager::TcpReset() - Aborting haven't added checksum offloading\n"); auto buf = MakeUniqueIOBuf(sizeof(TcpHeader) + sizeof(Ipv4Header) + sizeof(EthernetHeader)); @@ -1265,8 +1182,8 @@ void ebbrt::NetworkManager::TcpReset(bool ack, uint32_t seqno, uint32_t ackno, 
tcp_header.SetHdrLenFlags(sizeof(TcpHeader), kTcpRst | (ack ? kTcpAck : 0)); tcp_header.wnd = htons(TcpWindow16(kTcpWnd)); tcp_header.urgp = 0; - tcp_header.checksum = 0; - //OffloadPseudoCsum(*buf, kIpProtoTCP, local_ip, remote_ip); + tcp_header.checksum = + OffloadPseudoCsum(*buf, kIpProtoTCP, local_ip, remote_ip); //ebbrt::kprintf("TcpReset() src_ip=0x%llX dst_ip=0x%llX kIpProtoTCP=0x%X buf_len=0x%X src_port=0x%X dst_port=0x%X seq_num=0x%X ack_num=0x%X\n", local_ip.toU32(), remote_ip.toU32(), kIpProtoTCP, 0, tcp_header.src_port, tcp_header.dst_port, tcp_header.seqno, tcp_header.ackno); //ebbrt::kprintf("\t flags=0x%X windows=0x%X urgent_pointer=0x%X tcp_header.checksum=0x%X \n\n", tcp_header.hdrlen_flags, tcp_header.wnd, tcp_header.urgp, tcp_header.checksum); @@ -1275,6 +1192,8 @@ void ebbrt::NetworkManager::TcpReset(bool ack, uint32_t seqno, uint32_t ackno, pinfo.flags |= PacketInfo::kNeedsCsum; pinfo.csum_start = 0; // 14 byte eth header + 20 byte ip header pinfo.csum_offset = 16; // checksum is 16 bytes into the TCP header - + pinfo.tcp_len = 0; + pinfo.tcp_hdr_len = tcp_header.HdrLen(); + SendIp(std::move(buf), local_ip, remote_ip, kIpProtoTCP, pinfo); } diff --git a/src/native/Rapl.h b/src/native/Rapl.h index 5bd2ca8d..1b2a326b 100644 --- a/src/native/Rapl.h +++ b/src/native/Rapl.h @@ -111,6 +111,45 @@ namespace rapl { //ebbrt::kprintf("Total Package Energy used: %.6fJ\n", after - counter_offset); } + void SetLimit(uint32_t v) { + uint64_t result = ebbrt::msr::Read(kMsrPkgRaplPowerLimit); + uint64_t m = 0x7FFF; + uint32_t npower = (uint32_t)(v / 0.125); + + // resetting values + result = result & (~m); + result = result & (~(m << 32)); + + // new power + result = result | npower; + result = result | ((uint64_t)npower << 32); + + // set clamp + result |= 1LL << 15; + result |= 1LL << 16; + result |= 1LL << 47; + result |= 1LL << 48; + + uint32_t low = result & 0xFFFFFFFF; + uint32_t high = (result >> 32) & 0xFFFFFFFF; + asm volatile("wrmsr" : : 
"c"(kMsrPkgRaplPowerLimit), "a"(low), "d"(high)); + + result=ebbrt::msr::Read(kMsrPkgRaplPowerLimit); + ebbrt::kprintf("%u Package power limits are %s\n", v, (result >> 63) ? "locked" : "unlocked"); + double pkg_power_limit_1 = rapl_power_units*(double)((result>>0)&0x7FFF); + double pkg_time_window_1 = rapl_time_units*(double)((result>>17)&0x007F); + ebbrt::kprintf("Package power limit #1: %.3fW for %.6fs (%s, %s)\n", + pkg_power_limit_1, pkg_time_window_1, + (result & (1LL<<15)) ? "enable power limit" : "disabled", + (result & (1LL<<16)) ? "clamped" : "not_clamped"); + double pkg_power_limit_2 = rapl_power_units*(double)((result>>32)&0x7FFF); + double pkg_time_window_2 = rapl_time_units*(double)((result>>49)&0x007F); + ebbrt::kprintf("Package power limit #2: %.3fW for %.6fs (%s, %s)\n", + pkg_power_limit_2, pkg_time_window_2, + (result & (1LL<<47)) ? "enable power limit" : "disabled", + (result & (1LL<<48)) ? "clamped" : "not_clamped"); + } + double Read(); private: double rapl_power_units{0.0}; diff --git a/src/native/VirtioNet.cc b/src/native/VirtioNet.cc index 07c982e5..85a3ec22 100644 --- a/src/native/VirtioNet.cc +++ b/src/native/VirtioNet.cc @@ -165,6 +165,7 @@ ebbrt::VirtioNetRep::VirtioNetRep(const VirtioNetDriver& root) circ_buffer_tail_(0) {} void ebbrt::VirtioNetDriver::Config(std::string s, uint32_t v) {} +std::string ebbrt::VirtioNetDriver::ReadNic() { return ""; } void ebbrt::VirtioNetDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { diff --git a/src/native/VirtioNet.h b/src/native/VirtioNet.h index dac8b043..4387d46e 100644 --- a/src/native/VirtioNet.h +++ b/src/native/VirtioNet.h @@ -25,6 +25,7 @@ class VirtioNetDriver : public VirtioDriver, static uint32_t GetDriverFeatures(); void Send(std::unique_ptr buf, PacketInfo pinfo) override; void Config(std::string s, uint32_t v) override; + std::string ReadNic() override; const EthernetAddress& GetMacAddress() override; private: From bef8f3ad6b895dcb55f6b314faf407481e1cc524 Mon Sep 17 00:00:00 2001 
From: Han Date: Wed, 8 Apr 2020 14:52:57 -0400 Subject: [PATCH 15/20] added preliminary RSC capability --- src/native/Debug.cc | 3 - src/native/EventManager.cc | 11 +- src/native/Ixgbe.h | 6 + src/native/IxgbeDriver.cc | 1687 +++++++++++++++++--------- src/native/IxgbeDriver.h | 79 +- src/native/Net.cc | 17 +- src/native/NetTcp.cc | 3 +- src/native/Newlib.cc | 177 ++- src/native/Newlib.h | 40 +- src/native/Perf.cc | 6 +- toolchain/patches/newlib-2.0.0.patch | 161 ++- 11 files changed, 1562 insertions(+), 628 deletions(-) diff --git a/src/native/Debug.cc b/src/native/Debug.cc index fc769c2a..1f45a06d 100644 --- a/src/native/Debug.cc +++ b/src/native/Debug.cc @@ -12,9 +12,6 @@ void ebbrt::kvprintf(const char* __restrict format, va_list va) { auto len = vsnprintf(nullptr, 0, format, va); char buffer[len + 1]; // NOLINT vsnprintf(buffer, len + 1, format, va2); -#ifdef __EBBRT_ENABLE_BAREMETAL_NIC__ - console::Write("\r"); -#endif console::Write(buffer); } diff --git a/src/native/EventManager.cc b/src/native/EventManager.cc index 79829ba8..97de8cb5 100644 --- a/src/native/EventManager.cc +++ b/src/native/EventManager.cc @@ -162,11 +162,18 @@ void ebbrt::EventManager::Process() { asm volatile(".byte 0x0f, 0x01, 0xc8;" :: "a" ((void*)&flags), "c" (ecx), "d"(edx)); + + // https://elixir.bootlin.com/linux/v4.15.1/source/arch/x86/include/asm/mwait.h#L100 ecx = 1; //eax = 0x20; - eax = 0x60; - //eax = 0x30; + //eax = 0x60; + + // C1E state + //eax = 0x1; + + // C7 state + eax = 0x30; asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); diff --git a/src/native/Ixgbe.h b/src/native/Ixgbe.h index f187386e..6e281acb 100644 --- a/src/native/Ixgbe.h +++ b/src/native/Ixgbe.h @@ -51,6 +51,12 @@ enum l4_type { l4_type_udp = 0, l4_type_tcp, l4_type_sctp, l4_type_rsv }; #define RXFLAG_L4CS (1 << 2) #define RXFLAG_L4CS_VALID (1 << 3) +#define IXGBE_RX_BUFFER_WRITE 15 + +#define mb() asm volatile("mfence" ::: "memory") +#define rmb() asm volatile("lfence" ::: "memory") 
+#define wmb() asm volatile("sfence" ::: "memory") + /*********************** * RX * Descriptors diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index f5c25de6..ff81f87b 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -82,22 +82,31 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ixgmq[i]->powerMeter.SetLimit(v); } } else if(s == "start_perf") { - ixgmq[i]->stat_num_recv = 0; + ebbrt::kprintf_force("start_perf %d\n", i); + for(int j=0;j < 100;j++) { + ixgmq[i]->rx_desc_counts[j] = 0; + ixgmq[i]->tx_desc_counts[j] = 0; + } + + //ebbrt::kprintf_force("%d: %d %d\n", i, ixgmq[i]->tx_desc_counts.size(), ixgmq[i]->rx_desc_counts.size()); + /*ixgmq[i]->stat_num_recv = 0; ixgmq[i]->time_us = 0; ixgmq[i]->totalCycles = 0; ixgmq[i]->totalIns = 0; ixgmq[i]->totalLLCmisses = 0; ixgmq[i]->totalNrg = 0; + ixgmq[i]->fireCount = 0; ixgmq[i]->perfCycles.Start(); ixgmq[i]->perfInst.Start(); - ixgmq[i]->perfLLC_miss.Start(); + ixgmq[i]->perfLLC_miss.Start(); + if(i == 0 || i == 1) { auto d = ebbrt::clock::Wall::Now().time_since_epoch(); ixgmq[i]->time_us = std::chrono::duration_cast(d).count(); ixgmq[i]->powerMeter.Start(); } - ebb_->StartTimer(); + ebb_->StartTimer();*/ } else if(s == "stop_perf") { ixgmq[i]->perfCycles.Stop(); @@ -128,7 +137,7 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ebb_->StopTimer(); } else if(s == "print") { - uint64_t cycs, ins, llc, nints; + /*uint64_t cycs, ins, llc, nints; double ttime, tnrg; ttime = tnrg = 0.0; cycs = ins = llc = nints = 0; @@ -162,8 +171,17 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ebbrt::kprintf_force("TIME=%.2f\n", ttime); ebbrt::kprintf_force("WATTS=%.2f\n", tnrg/ttime); ebbrt::kprintf_force("AVG_ITR_PER_CORE=%.2f\n", (float)nints/static_cast(Cpu::Count())); - ebbrt::kprintf_force("\n"); - + ebbrt::kprintf_force("\n");*/ + + for(int j=0;j<40;j++) { + uint32_t sumt = 0; + uint32_t sumr = 0; + for(uint32_t c = 0; c < 
static_cast(Cpu::Count()); c++) { + sumt += ixgmq[c]->tx_desc_counts[j]; + sumr += ixgmq[c]->rx_desc_counts[j]; + } + ebbrt::kprintf_force("%d,%d,%d\n", j, sumt, sumr); + } //ebbrt::kprintf_force("nrg=%.2lf J\n", tnrg); //ebbrt::kprintf_force("ttime=%.2f s time1=%.2f s time2=%.2f s\n", ttime, ixgmq[0]->totalTime, ixgmq[1]->totalTime); @@ -194,52 +212,14 @@ void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { // tx queue so that it can be used again // TX_HEAD_WB does it automatically void ebbrt::IxgbeDriverRep::ReclaimTx() { -/*#ifndef TX_HEAD_WB - size_t head = ixgmq_.tx_head_; - size_t tail = ixgmq_.tx_tail_; - tdesc_advance_tx_wbf_t* actx; - - // go through all descriptors owned by HW - while (head != tail) { - actx = reinterpret_cast(&(ixgmq_.tx_ring_[head])); - - // if context - if (ixgmq_.tx_isctx[head]) { - head = (head + 1) % ixgmq_.tx_size_; - ixgmq_.tx_isctx[head] = false; - } - // if non eop - else if (!(actx->dd)) { - head = (head + 1) % ixgmq_.tx_size_; - } - // eop - else if (actx->dd) { - head = (head + 1) % ixgmq_.tx_size_; - ixgmq_.tx_head_ = head; - } - } - #endif*/ -} - -void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { - uint64_t data, len, tsodata, tsolen; - std::unique_ptr b; - tdesc_advance_tx_rf_t* arfx; - tdesc_advance_ctxt_wb_t* actx; + // with TX head writeback, shouldn't need to poll anymore (right?) 
+#ifndef TX_HEAD_WB tdesc_advance_tx_wbf_t* awbfx; - uint32_t mcore = static_cast(Cpu::GetMine()); - uint32_t end, free_desc; + uint32_t free_desc; - // On TSO, the maximum PAYLEN can be up to 2^18 - 1 - len = buf->ComputeChainDataLength(); - if (len > 262144) { - ebbrt::kprintf_force("\t kabort Send() len=%u greater than TSO limit of 262144 bytes\n", len); - return; - } - if(ixgmq_.tx_tail_ > ixgmq_.tx_head_) { free_desc = IxgbeDriver::NTXDESCS - (ixgmq_.tx_tail_ - ixgmq_.tx_head_); - } else if(ixgmq_.tx_tail_ < ixgmq_.tx_head_){ + } else if(ixgmq_.tx_tail_ < ixgmq_.tx_head_) { free_desc = IxgbeDriver::NTXDESCS - ((ixgmq_.tx_tail_+IxgbeDriver::NTXDESCS) - ixgmq_.tx_head_); } else { free_desc = IxgbeDriver::NTXDESCS; @@ -262,7 +242,17 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { } ixgmq_.tx_head_ = head; } +#endif +} + +void ebbrt::IxgbeDriverRep::SendUdp(std::unique_ptr buf, uint64_t len, PacketInfo pinfo) { + uint64_t data; + std::unique_ptr b; + tdesc_advance_tx_rf_t* arfx; + uint32_t end; + // coalesce into single packet if no checksum + //ebbrt::kprintf_force("SendUdp len=%d\n", len); if(buf->IsChained()) { b = MakeUniqueIOBuf(len); auto mdata = b->MutData(); @@ -274,164 +264,276 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { } else { data = reinterpret_cast(buf->Data()); } + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + + arfx->address = data; - // if no IP/TCP checksum - if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. + // Max length is 15.5 KB + arfx->dtalen = len; + + // In a single-send packet, PAYLEN defines the entire packet size fetched from host memory. 
+ arfx->paylen = len; + + // crc checksum + arfx->ifcs = 1; + + // rs bit should only be set when eop is set + arfx->eop = 1; + arfx->rs = 1; + + // type is advanced + arfx->dtyp = 0x3; + arfx->dext = 1; + + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); + end = static_cast(ixgmq_.tx_tail_); + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + +} + +void ebbrt::IxgbeDriverRep::SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo) { + uint64_t data, tsodata, tsolen; + std::unique_ptr b; + tdesc_advance_tx_rf_t* arfx; + tdesc_advance_ctxt_wb_t* actx; + uint32_t end; + + data = reinterpret_cast(buf->Data()); + //ebbrt::kprintf_force("SendTCPUnchained len=%llu\n", len); + + if(len > IXGBE_MAX_DATA_PER_TXD) { + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0x2 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + //first descriptor arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); arfx->raw[0] = 0x0; arfx->raw[1] = 0x0; - arfx->address = data; - // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
// Max length is 15.5 KB - arfx->dtalen = len; - - // In a single-send packet, PAYLEN defines the entire packet size fetched from host memory. - arfx->paylen = len; - - // crc checksum - arfx->ifcs = 1; - - // rs bit should only be set when eop is set - arfx->eop = 1; - arfx->rs = 1; - - // type is advanced + arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; arfx->dtyp = 0x3; + arfx->ifcs = 1; arfx->dext = 1; - - //ebbrt::kprintf("Send len=%d\n", len); - //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); - //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); - end = static_cast(ixgmq_.tx_tail_); - ixgmq_.tx_iseop[end] = true; - //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); + arfx->tse = 1; + arfx->ixsm = 1; + arfx->txsm = 1; + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload length, so no header length + arfx->paylen = pinfo.tcp_len; ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - - } else { - - if(len > IXGBE_MAX_DATA_PER_TXD) { - //start = ixgmq_.tx_tail_; - /*** CONTEXT START ***/ - actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); - actx->raw_1 = 0x0; - actx->raw_2 = 0x0; - actx->iplen = IPHDR_LEN; - actx->maclen = ETHHDR_LEN; - // ip packet type = ipv4: 01 - actx->ipv4 = 1; - - if (pinfo.csum_offset == 6) { - // l4type = udp: 00 - actx->l4t = 0; - } else if (pinfo.csum_offset == 16) { - // l4type = tcp: 01 - actx->l4t = 1; - } - // for context descriptor 0x2 - actx->dtyp = 0x2; - // descriptor extension, one for advanced mode - actx->dext = 1; - // from Linux??, ignored when no TSE - actx->mss = 1448; - // TCP header length, with no tcp options == 20, ignored when no TSE - actx->l4len = pinfo.tcp_hdr_len; - //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, 
actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - /*** CONTEXT END ***/ + tsodata = data; + tsolen = len; - //first descriptor + while(tsolen > IXGBE_MAX_DATA_PER_TXD) { + tsodata += IXGBE_MAX_DATA_PER_TXD; + tsolen -= IXGBE_MAX_DATA_PER_TXD; + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); arfx->raw[0] = 0x0; arfx->raw[1] = 0x0; - arfx->address = data; - // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. - // Max length is 15.5 KB - arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; arfx->dtyp = 0x3; - arfx->ifcs = 1; arfx->dext = 1; arfx->tse = 1; - arfx->ixsm = 1; - arfx->txsm = 1; - // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload length, so no header length - arfx->paylen = pinfo.tcp_len; - //ebbrt::kprintf("Send() first descriptor mcore=%u tail=%u dtalen=%u paylen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, IXGBE_MAX_DATA_PER_TXD, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); - //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - - tsodata = data; - tsolen = len; - - while(tsolen > IXGBE_MAX_DATA_PER_TXD) { - tsodata += IXGBE_MAX_DATA_PER_TXD; - tsolen -= IXGBE_MAX_DATA_PER_TXD; - - arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); - arfx->raw[0] = 0x0; - arfx->raw[1] = 0x0; - arfx->dtyp = 0x3; - arfx->dext = 1; - arfx->tse = 1; - arfx->ifcs = 1; - arfx->address = tsodata; + arfx->ifcs = 1; + arfx->address = tsodata; - if(tsolen > IXGBE_MAX_DATA_PER_TXD) { - arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; - //ebbrt::kprintf("Send() middle descriptor(s) mcore=%u tail=%u dtalen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, IXGBE_MAX_DATA_PER_TXD, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] 
>> 32) & 0xFFFFFFFF)); - //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - } else { - // last descriptor - arfx->dtalen = tsolen; - arfx->eop = 1; - arfx->rs = 1; + if(tsolen > IXGBE_MAX_DATA_PER_TXD) { + arfx->dtalen = IXGBE_MAX_DATA_PER_TXD; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } else { + // last descriptor + arfx->dtalen = tsolen; + arfx->eop = 1; + arfx->rs = 1; - //ebbrt::kprintf("Send() last descriptor mcore=%u tail=%u dtalen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, tsolen, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); - //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); - end = ixgmq_.tx_tail_; - ixgmq_.tx_iseop[end] = true; - //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - } - } - } - else if(len > 1490 && len < IXGBE_MAX_DATA_PER_TXD) { - //start = ixgmq_.tx_tail_; + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } + } + } + else if(len > 1490 && len < IXGBE_MAX_DATA_PER_TXD) { - /*** CONTEXT START ***/ - actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); - actx->raw_1 = 0x0; - actx->raw_2 = 0x0; - actx->iplen = IPHDR_LEN; - actx->maclen = ETHHDR_LEN; - // ip packet type = ipv4: 01 - actx->ipv4 = 1; + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; - if (pinfo.csum_offset == 6) { - // l4type = udp: 00 - actx->l4t = 0; - } else if (pinfo.csum_offset == 16) { - // l4type = tcp: 01 - actx->l4t = 1; - } + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + 
// l4type = tcp: 01 + actx->l4t = 1; + } - // for context descriptor 0b0010 - actx->dtyp = 0x2; - // descriptor extension, one for advanced mode - actx->dext = 1; - // from Linux??, ignored when no TSE - actx->mss = 1448; - // TCP header length, with no tcp options == 20, ignored when no TSE - actx->l4len = pinfo.tcp_hdr_len; - //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - /*** CONTEXT END ***/ + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = len; + arfx->dtyp = 0x3; + arfx->eop = 1; + arfx->rs = 1; + arfx->ifcs = 1; + arfx->dext = 1; + arfx->tse = 1; + + arfx->ixsm = 1; + arfx->txsm = 1; + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size + arfx->paylen = pinfo.tcp_len; + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } else { /**** NOT IN TSE mode****/ + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux, ignored when no TSE + actx->mss = 0; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = 0; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + arfx->raw[0] = 0x0; + arfx->raw[1] = 0x0; + arfx->address = data; + + // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
+ // Max length is 15.5 KB + arfx->dtalen = len; + arfx->paylen = len; + + arfx->dtyp = 0x3; + arfx->eop = 1; + arfx->rs = 1; + arfx->ifcs = 1; + + arfx->dext = 1; + arfx->tse = 0; + + arfx->ixsm = 1; + arfx->txsm = 1; + end = ixgmq_.tx_tail_; + ixgmq_.tx_iseop[end] = true; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + } +} + +void ebbrt::IxgbeDriverRep::SendTCPChained(std::unique_ptr buf, uint64_t len, uint64_t num_chains, PacketInfo pinfo) { + uint64_t data, i; + std::unique_ptr b; + tdesc_advance_tx_rf_t* arfx; + tdesc_advance_ctxt_wb_t* actx; + uint32_t end; + //uint32_t mcore = static_cast(Cpu::GetMine()); + + //ebbrt::kprintf_force("** SendTCPChained num_chains=%llu len=%llu START ** \n", num_chains, len); + if(len <= 1490) { + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; + + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } + // for context descriptor 0b0010 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux, ignored when no TSE + actx->mss = 0; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = 0; + //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + i = 0; + for (auto& buf_it : *buf) { + data = reinterpret_cast(buf_it.Data()); + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); arfx->raw[0] = 0x0; arfx->raw[1] = 0x0; @@ -439,103 +541,167 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { // 
Holds length in bytes of data buffer at the address pointed to by this specific descriptor. // Max length is 15.5 KB - arfx->dtalen = len; - arfx->dtyp = 0x3; - arfx->eop = 1; + arfx->dtalen = buf_it.Length(); + // only valid on first descriptor of packet + if(i == 0) { + arfx->paylen = len; + } + arfx->rs = 1; + arfx->dtyp = 0x3; + //only valid on last descriptor making up packet + if(i == (num_chains-1)) { + arfx->eop = 1; + } + arfx->ifcs = 1; + arfx->dext = 1; - arfx->tse = 1; + arfx->tse = 0; arfx->ixsm = 1; + + // if need TCP checksum offload arfx->txsm = 1; - // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload size - arfx->paylen = pinfo.tcp_len; - //ebbrt::kprintf("Send mcore=%u tail=%u dtalen=%u paylen=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, len, pinfo.tcp_len, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); end = ixgmq_.tx_tail_; ixgmq_.tx_iseop[end] = true; - //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - } else { - - //start = ixgmq_.tx_tail_; - /*** CONTEXT START ***/ - actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); - actx->raw_1 = 0x0; - actx->raw_2 = 0x0; - actx->iplen = IPHDR_LEN; - actx->maclen = ETHHDR_LEN; - // ip packet type = ipv4: 01 - actx->ipv4 = 1; + i++; + } + } else { + /*** CONTEXT START ***/ + actx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); + actx->raw_1 = 0x0; + actx->raw_2 = 0x0; + actx->iplen = IPHDR_LEN; + actx->maclen = ETHHDR_LEN; + // ip packet type = ipv4: 01 + actx->ipv4 = 1; - if (pinfo.csum_offset == 6) { - // l4type = udp: 00 - actx->l4t = 0; - } else if (pinfo.csum_offset == 16) { - // l4type = tcp: 01 - actx->l4t = 1; - } + if (pinfo.csum_offset == 6) { + // l4type = udp: 00 + actx->l4t = 0; + } else if (pinfo.csum_offset == 16) { + // l4type = tcp: 01 + actx->l4t = 1; + } - // for context descriptor 0b0010 - actx->dtyp = 
0x2; - // descriptor extension, one for advanced mode - actx->dext = 1; - // from Linux, ignored when no TSE - actx->mss = 0; - // TCP header length, with no tcp options == 20, ignored when no TSE - actx->l4len = 0; - //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_ctxt_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, actx->raw_1, (uint32_t)(actx->raw_2 & 0xFFFFFFFF), (uint32_t)((actx->raw_2 >> 32) & 0xFFFFFFFF)); - ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - /*** CONTEXT END ***/ + // for context descriptor 0x2 + actx->dtyp = 0x2; + // descriptor extension, one for advanced mode + actx->dext = 1; + // from Linux??, ignored when no TSE + actx->mss = 1448; + // TCP header length, with no tcp options == 20, ignored when no TSE + actx->l4len = pinfo.tcp_hdr_len; + ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; + /*** CONTEXT END ***/ + + i = 0; + for (auto& buf_it : *buf) { + if (buf_it.Length() > IXGBE_MAX_DATA_PER_TXD) { + ebbrt::kprintf("TSE buf_it.Length() = %u > IXGBE_MAX_DATA_PER_TXD\n", buf_it.Length()); + return; + } + data = reinterpret_cast(buf_it.Data()); + arfx = reinterpret_cast(&(ixgmq_.tx_ring_[ixgmq_.tx_tail_])); arfx->raw[0] = 0x0; arfx->raw[1] = 0x0; arfx->address = data; - // Holds length in bytes of data buffer at the address pointed to by this specific descriptor. 
// Max length is 15.5 KB - arfx->dtalen = len; - arfx->paylen = len; - + arfx->dtalen = buf_it.Length(); //GBE_MAX_DATA_PER_TXD; arfx->dtyp = 0x3; - arfx->eop = 1; - arfx->rs = 1; arfx->ifcs = 1; - arfx->dext = 1; - arfx->tse = 0; - - arfx->ixsm = 1; + arfx->tse = 1; + + //ebbrt::kprintf_force("\t SendTCPChained i=%u len=%llu\n", i, buf_it.Length()); - // if need TCP checksum offload - //if (pinfo.flags & PacketInfo::kNeedsCsum) { - arfx->txsm = 1; - //} - //ebbrt::kprintf("Send mcore=%u tail=%u tx_adv_rd_desc = 0x%llX 0x%X 0x%X\n", mcore, ixgmq_.tx_tail_, arfx->raw[0], (uint32_t)(arfx->raw[1] & 0xFFFFFFFF), (uint32_t)((arfx->raw[1] >> 32) & 0xFFFFFFFF)); - //ixgmq_.tx_last_tail_ = ixgmq_.tx_tail_; - //ixgmq_.send_to_watch.emplace_back(ixgmq_.tx_tail_); - end = ixgmq_.tx_tail_; - ixgmq_.tx_iseop[end] = true; - //ixgmq_.send_to_watch.emplace_back(std::make_pair(start, end)); + // first descriptor + if (i == 0) { + arfx->ixsm = 1; + arfx->txsm = 1; + + // In Tcp Segmentation Mode (TSE), PAYLEN defines the TCP/UDP payload length, so no header length + // only valid on first descriptor + arfx->paylen = pinfo.tcp_len; + } else if(i == (num_chains-1)) { // last descriptor + arfx->eop = 1; + arfx->rs = 1; + arfx->dtalen = buf_it.Length(); + } else { + arfx->dtalen = buf_it.Length(); + } + + i ++; ixgmq_.tx_tail_ = (ixgmq_.tx_tail_ + 1) % ixgmq_.tx_size_; - } + //asm volatile("sfence" ::: "memory"); + // WriteTdt_1(mcore, ixgmq_.tx_tail_); + } } - //ebbrt::kprintf("\t Send() core=%u head=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_tail_, free_desc); + //ebbrt::kprintf_force("** SendTcpChained END **\n"); +} + +void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { + uint64_t len, num_chains; + uint32_t mcore = static_cast(Cpu::GetMine()); + std::unique_ptr b; + + // On TSO, the maximum PAYLEN can be up to 2^18 - 1 + len = buf->ComputeChainDataLength(); + if (len > 262144) { + ebbrt::kprintf_force("\t kabort Send() len=%lld greater than 
TSO limit of 262144 bytes\n", len); + return; + } + num_chains = buf->CountChainElements(); + + ReclaimTx(); + // if no IP/TCP checksum - likely UDP packet + if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { + SendUdp(std::move(buf), len, pinfo); + } else { // TCP Packet + + // hardware limits sending over 40 descriptors per packet, have to manually coalesce here + // hopefully not too often + if(num_chains > 38) { + ixgmq_.tx_desc_counts[39] ++; + //ebbrt::kprintf_force("*** num_chains=%d > 38\n", num_chains); + b = MakeUniqueIOBuf(len); + auto mdata = b->MutData(); + for (auto& buf_it : *buf) { + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); + } + //data = reinterpret_cast(b->MutData()); + SendTCPUnchained(std::move(b), len, pinfo); + + } else if(buf->IsChained() && num_chains <= 38) { + ixgmq_.tx_desc_counts[num_chains] ++; + SendTCPChained(std::move(buf), len, num_chains, pinfo); + } else { //Not Chained + ixgmq_.tx_desc_counts[1] ++; + SendTCPUnchained(std::move(buf), len, pinfo); + } + } + //ebbrt::kprintf("\t Send() core=%u head=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_tail_, free_desc); asm volatile("sfence" ::: "memory"); //ebbrt::kprintf("\t Send() core=%u head=%u last_tail=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_last_tail_, ixgmq_.tx_tail_, free_desc); WriteTdt_1(mcore, ixgmq_.tx_tail_); - - /*while(arfx->dd == 0) { - // makes sure all reads are finished before checking again - asm volatile("lfence":::"memory"); - }*/ + + //while(arfx->dd == 0) { + // makes sure all reads are finished before checking again + //asm volatile("lfence":::"memory"); + //} //auto d = ebbrt::clock::Wall::Now().time_since_epoch(); //ixgmq_.time_send = std::chrono::duration_cast(d).count(); + } void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { @@ -565,8 +731,9 @@ void ebbrt::IxgbeDriver::WriteEiam(uint32_t n, uint32_t m) { // 8.2.3.5.18 - General Purpose Interrupt 
Enable — GPIE (0x00898; RW) void ebbrt::IxgbeDriver::WriteGpie(uint32_t m) { - auto reg = bar0_.Read32(0x00898); - bar0_.Write32(0x00898, reg | m); + //auto reg = bar0_.Read32(0x00898); + //bar0_.Write32(0x00898, reg | m); + bar0_.Write32(0x00898, m); } // 8.2.3.5.1 Extended Interrupt Cause Register- EICR (0x00800; RW1C) @@ -1008,16 +1175,15 @@ void ebbrt::IxgbeDriver::WriteRxcsum(uint32_t m) { // 8.2.3.8.13 RSC Control — RSCCTL[n] (0x0102C + 0x40*n, n=0...63 // and 0x0D02C + 0x40*(n-64), n=64...127; RW) void ebbrt::IxgbeDriver::WriteRscctl(uint32_t n, uint32_t m) { - auto reg = bar0_.Read32(0x0102C + 0x40 * n); - bar0_.Write32(0x0102C + 0x40 * n, reg | m); + //auto reg = bar0_.Read32(0x0102C + 0x40 * n); + //bar0_.Write32(0x0102C + 0x40 * n, reg | m); + bar0_.Write32(0x0102C + 0x40 * n, m); } // 8.2.3.7.4 Packet Split Receive Type Register — PSRTYPE[n] // (0x0EA00 + 4*n, n=0...63 / 0x05480 + 4*n, n=0...15; RW) void ebbrt::IxgbeDriver::WritePsrtype(uint32_t n, uint32_t m) { - //auto reg = bar0_.Read32(0x0EA00 + 0x40 * n); - auto reg = bar0_.Read32(0x05480 + 0x40 * n); - bar0_.Write32(0x0EA00 + 0x40 * n, reg | m); + bar0_.Write32(0x0EA00 + 4 * n, m); } void ebbrt::IxgbeDriver::WritePsrtypeZero(uint32_t n) { @@ -1298,6 +1464,11 @@ void ebbrt::IxgbeDriver::WriteHlreg0(uint32_t m) { bar0_.Write32(0x04240, m); } +// 8.2.3.22.13 Max Frame Size — MAXFRS (0x04268; RW) +void ebbrt::IxgbeDriver::WriteMaxfrs(uint32_t m) { + bar0_.Write32(0x04268, m); +} + // 8.2.3.8.5 Receive Descriptor Tail — RDT[n] (0x01018 + 0x40*n, n=0...63 and // 0x0D018 + 0x40*(n-64), n=64...127; RW) void ebbrt::IxgbeDriver::WriteRdt_1(uint32_t n, uint32_t m) { @@ -1634,21 +1805,26 @@ void ebbrt::IxgbeDriver::Init() { /* setup msix */ // switch to msix mode - WriteGpie(0x1 << 4); // Multiple_MSIX - WriteGpie(0x1 << 5); // OCD - WriteGpie(0x1 << 31); // PBA_support + //WriteGpie(0x1 << 4); // Multiple_MSIX + //WriteGpie(0x1 << 5); // OCD + //WriteGpie(0x1 << 31); // PBA_support // Enable auto masking of 
interrupt - WriteGpie(0x1 << 30); // EIAME + //WriteGpie(0x1 << 30); // EIAME // TODO: Set up management interrupt handler - -#ifdef RSC_EN + //WriteGpie(0xC0000036); + uint32_t gpie = 0xC0000036 | (0x7 << 11); + //uint32_t gpie = 0xC0000036 | (IxgbeDriver::RSC_DELAY << 11); + WriteGpie(gpie); + ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d us\n", (((gpie >> 11) & 0x7)+1)*4); + +//#ifdef RSC_EN // TODO: RSC delay value, just a guess at (1 + 1) * 4us = 8 us // Recommended value based on 7.3.2.1.1 - WriteGpie(IxgbeDriver::RSC_DELAY << 11); - ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d\n", (IxgbeDriver::RSC_DELAY + 1) * 4); -#endif + //WriteGpie(IxgbeDriver::RSC_DELAY << 11); + //ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d\n", (IxgbeDriver::RSC_DELAY + 1) * 4); +//#endif /* FreeBSD: * ixgbe_common.c - s32 ixgbe_init_rx_addrs_generic(struct ixgbe_hw *hw) @@ -1770,6 +1946,11 @@ void ebbrt::IxgbeDriver::Init() { WriteReta(i+1, 0x3020100); WriteReta(i+2, 0x3020100); WriteReta(i+3, 0x3020100); + } else if(ncore == 6) { + WriteReta(i, 0x03020100); + WriteReta(i+1, 0x01000504); + WriteReta(i+2, 0x05040302); + WriteReta(i+3, 0x04030201); } else if(ncore == 8) { WriteReta(i, 0x3020100); WriteReta(i+1, 0x7060504); @@ -1800,7 +1981,7 @@ void ebbrt::IxgbeDriver::Init() { WriteRdrxctlRSCFRSTSIZE(~(0x1F << 17)); // s/w set to 0 WriteRdrxctl(0x1 << 1); // CRCStrip //WriteHlreg0(0x1 << 1); // CRCStrip - WriteHlreg0(0x2FFF); // CRCStrip + WriteHlreg0(0x2FFF); // CRCStrip, Enable Jumbo Packets - Linux Default WriteRdrxctl(0x1 << 25); // RSCACKC s/w set to 1 WriteRdrxctl(0x1 << 26); // FCOE_WRFIX s/w set to 1 @@ -1847,8 +2028,10 @@ void ebbrt::IxgbeDriver::Init() { WriteMflcn(0x1 << 3); // end DCB off, VT off - // TODO Enable Jumbo Packets - + // MAXFRS + WriteMaxfrs(1518 << 16); + //WriteMaxfrs(4096 << 16); + // disable relaxed ordering /*for (auto i = 0; i < 128; i++) { WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); // Txdesc_Wbro @@ -1903,7 +2086,11 @@ void 
ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteRdlen_1(i, ixgmq[i]->rx_size_bytes_); // program srrctl register - WriteSrrctl_1(i, 0x2000403); + //WriteSrrctl_1(i, 0x2000403); // 3KB + //WriteSrrctl_1(i, 0x2000410); // 16KB + //WriteSrrctl_1(i, (0x1 << 25) | (0x4 << 8) | (IxgbeDriver::RXBUFSZ / 1024)); // desctype adv 001b, BSIZEHEADER = 0x7 * 64B, BSIZEPACKET= 0x4 * 1 KB + WriteSrrctl_1(i, (0x1 << 25) | (0x4 << 8) | (3072 / 1024)); + /*WriteSrrctlZero(i); WriteSrrctl_1(i, RXBUFSZ / 1024); // bsizepacket WriteSrrctl_1(i, (128 / 64) << 8); // bsizeheader @@ -1921,20 +2108,27 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { #ifdef RSC_EN // RSC set up - WriteRscctl(i, 0x3 << 2); // MAXDESC - WriteRscctl(i, 0x1); // RSCEN + /**** + Maximum descriptors per Large receive as follow: + 00b = Maximum of 1 descriptor per large receive. + 01b = Maximum of 4 descriptors per large receive. + 10b = Maximum of 8 descriptors per large receive. + 11b = Maximum of 16 descriptors per large receive. + + (3:2) MAXDESC * SRRCTL.BSIZEPKT must not exceed 64 KB minus one, which is the + maximum total length in the IP header and must be larger than the expected + received MSS + *****/ + + WriteRscctl(i, 0x1 | (0x11 << 2)); // RSCEN=1, MAXDESC= (0x1) * SRRCTL.BSIZEPACKET < 64KB + WritePsrtype(i, 0x1 << 4); // 4.6.7.2.2 - PSR_type4 in PSRTYPE[n] should be set #endif // In NON-IOV, only psrtype[0] is used - if (i == 0) { - WritePsrtypeZero(0x1330); - } - //WritePsrtypeZero(i); - //WritePsrtype(i, 0x1 << 4); // Split received TCP packets after TCP header. 
- - - //WritePsrtype(0, 0x40001330); - + //if (i == 0) { + // WritePsrtypeZero(0x1330); + // } + // Set head and tail pointers WriteRdt_1(i, 0x0); WriteRdh_1(i, 0x0); @@ -1965,9 +2159,10 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { //WriteIvarAllocval2(qn, 0x1 << 31); } - // must be greater than rsc delay + // must be greater than rsc delay + //WriteEitr(i, (IxgbeDriver::ITR_INTERVAL << 3) | IXGBE_EITR_CNT_WDIS); // WriteEitr(i, 0x80 << 3); // 7 * 2us = 14 us - WriteEitr(i, (IxgbeDriver::ITR_INTERVAL << 3) | IXGBE_EITR_CNT_WDIS); + WriteEitr(i, (32 << 3) | IXGBE_EITR_CNT_WDIS); // 7.3.1.4 - Note that there are no EIAC(1)...EIAC(2) registers. // The hardware setting for interrupts 16...63 is always auto clear. @@ -2002,7 +2197,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { auto tail = ixgmq[i]->rx_tail_; // update buffer address for descriptor -/*#ifdef RSC_EN +/*#ifdef RSC_EN rdesc_adv_rf_t* tmp; tmp = reinterpret_cast(&(ixgmq[i]->rx_ring_[tail])); @@ -2022,7 +2217,8 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { auto rxphys = reinterpret_cast((ixgmq[i]->circ_buffer_[NRXDESCS-1])->MutData()); ixgmq[i]->rx_ring_[ixgmq[i]->rx_tail_].buffer_address = rxphys; - + + asm volatile("sfence" ::: "memory"); // bump tail pts via register rdt to enable descriptor fetching by setting to // length of ring minus one WriteRdt_1(i, ixgmq[i]->rx_tail_); @@ -2051,6 +2247,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteTdlen(i, ixgmq[i]->tx_size_bytes_); #ifdef TX_HEAD_WB + ebbrt::kprintf_force("TX_HEAD_WB Enabled\n"); WriteTdwbal(i, (ixgmq[i]->txhwbaddr_ & 0xFFFFFFFF) | 0x1); WriteTdwbah(i, (ixgmq[i]->txhwbaddr_ >> 32) & 0xFFFFFFFF); #endif @@ -2080,89 +2277,395 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { //WriteDcaTxctrlTxdescWbro(i, ~(0x1 << 11)); } -// after packet received, need to make sure device can reuse -void ebbrt::IxgbeDriverRep::ReclaimRx() { - for (size_t i = 0; i < ixgmq_.rsc_chain_.size(); i++) { - // 
bump tail ptr - ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; - auto n = ixgmq_.rsc_chain_[i].first; - - // reset buffer - ixgmq_.rx_ring_[n].raw[0] = 0; - ixgmq_.rx_ring_[n].raw[1] = 0; - // allocate new rx buffer - ixgmq_.circ_buffer_[n] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); - auto rxphys = - reinterpret_cast((ixgmq_.circ_buffer_[n])->MutData()); - // update buffer with new adder - ixgmq_.rx_ring_[n].buffer_address = rxphys; - } -} - -// keep check for new packets to receive -// may wait for RSC to be done -uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, - uint64_t* rxflag, bool* process_rsc, - uint32_t* rnt, uint32_t* rxhead) { -//#ifdef RSC_EN - rdesc_adv_wb_t* tmp; - tmp = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); - - // if no rx packets ready - if (!(tmp->dd)) { - return 0; - } - - auto rsccnt = tmp->rsccnt; - - // makes sure all reads are finished before - asm volatile("lfence":::"memory"); - - //ebbrt::kprintf("rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", *rxhead, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); +// Packet receive interrupt handler +void ebbrt::IxgbeDriverRep::ReceivePoll() { + uint32_t plen, i; //, i, ntc; + uint64_t rxflag; + rdesc_adv_wb_t* rx_desc; + uint32_t mcore = static_cast(Cpu::GetMine()); - // not RSC, handled normally - if (rsccnt == 0 && tmp->eop && tmp->dd) { - *len = tmp->pkt_len; - - /* set rx flags */ - // TCP/UDP checksum - if (tmp->l4i) { - *rxflag |= RXFLAG_L4CS; - if (!(tmp->l4e)) { - *rxflag |= RXFLAG_L4CS_VALID; - } + i = 0; + while(i < 64) { + rxflag = 0x0; + rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); + + // Linux's ixgbe: + // This memory barrier is needed to keep us from reading + // any other fields out of the rx_desc until we know the + // descriptor has been written back + // + asm volatile("" ::: "memory"); + + // if no rx packets ready + if (!(rx_desc->dd)) { + return; + } + + // return buffers to 
hardware + if(ixgmq_.cleaned_count > IXGBE_RX_BUFFER_WRITE) { + //ebbrt::kprintf_force("START c=%u cleaned_count=%u head=%u tail=%u ", mcore, ixgmq_.cleaned_count, + // ixgmq_.rx_head_, ixgmq_.rx_tail_); + + while(ixgmq_.cleaned_count) { + // reset buffer + ixgmq_.rx_ring_[ixgmq_.rx_tail_].raw[0] = 0x0; + ixgmq_.rx_ring_[ixgmq_.rx_tail_].raw[1] = 0x0; + + // allocate new rx buffer + ixgmq_.circ_buffer_[ixgmq_.rx_tail_] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[ixgmq_.rx_tail_])->MutData()); + // update descriptor with new buffer adder + ixgmq_.rx_ring_[ixgmq_.rx_tail_].buffer_address = rxphys; + + ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + ixgmq_.cleaned_count --; + } + + ixgmq_.cleaned_count = 0; + + // Force memory writes to complete before letting h/w + //know there are new descriptors to fetch. (Only + // applicable for weak-ordered memory model archs, + // such as IA-64). + // + wmb(); + WriteRdt_1(mcore, ixgmq_.rx_tail_); + + //ebbrt::kprintf_force("END c=%u cleaned_count=%u head=%u tail=%u\n", mcore, ixgmq_.cleaned_count, +// ixgmq_.rx_head_, ixgmq_.rx_tail_); } + + // handle a single receive + if(rx_desc->eop) + { + ixgmq_.rx_desc_counts[1] ++; + + //if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) + plen = rx_desc->pkt_len; + if(!plen) return; // Linux's ixgbe driver checks this case + + // TCP/UDP checksum + if (rx_desc->l4i) { + rxflag |= RXFLAG_L4CS; + if (!(rx_desc->l4e)) { + rxflag |= RXFLAG_L4CS_VALID; + } + } + // Ipv4 checksum + if (rx_desc->ipcs) { + rxflag |= RXFLAG_IPCS; + if (!(rx_desc->ipe)) { + rxflag |= RXFLAG_IPCS_VALID; + } + } - // Ipv4 checksum - if (tmp->ipcs) { - *rxflag |= RXFLAG_IPCS; - if (!(tmp->ipe)) { - *rxflag |= RXFLAG_IPCS_VALID; - } + // setup rx buffers + ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); + auto b = std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_]); + ixgmq_.cleaned_count ++; + i ++; + + //ebbrt::kprintf_force("ReceivePoll() 
core=%u, head=%u tail=%u plen=%u\n", +// mcore, ixgmq_.rx_head_, ixgmq_.rx_tail_, plen); + // update next rx descriptor to process + //ntc = ixgmq_.rx_head_ + 1; + //ntc = (ntc < ixgmq_.rx_size_) ? ntc : 0; + //ixgmq_.rx_head_ = ntc; + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + // TODO add _mm_prefetch from Linux?? + root_.itf_.Receive(std::move(b), rxflag); } + else + { + uint32_t rsc_count = 0; + + //RSC FIRST PACKET + plen = rx_desc->pkt_len; + if(!plen) { + ebbrt::kabort("**** RSC first packet Abort: core %u pkt_len == 0.\n", mcore); + return; + } + + //ebbrt::kprintf("\nRSC desc=%d next_desc=%d len=%d eop=%d\n", ixgmq_.rx_head_, rx_desc->next_descriptor_ptr, plen, rx_desc->eop); + + // setup rx buffers + ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); + auto b = std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_]); + ixgmq_.cleaned_count ++; + i ++; + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + rsc_count ++; + + while(true) { + rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); + retry: + // Linux's ixgbe: + // This memory barrier is needed to keep us from reading + // any other fields out of the rx_desc until we know the + // descriptor has been written back + // + asm volatile("" ::: "memory"); + + // if no rx packets ready + if (!(rx_desc->dd)) { + goto retry; + ebbrt::kprintf_force("**** RSC Abort: core %u rx_desc->dd == 0\n", mcore); + //return; + //mb(); + //rmb(); + //wmb(); + } + + plen = rx_desc->pkt_len; + if(!plen) { + ebbrt::kabort("***** RSC middle packetsAbort: core %u pkt_len == 0.\n", mcore); + return; + } + + // setup rx buffers + ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); + b->PrependChain(std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_])); + ixgmq_.cleaned_count ++; + i ++; - *rxhead = ixgmq_.rx_head_; - //ebbrt::kprintf("\t rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", *rxhead, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); + //ebbrt::kprintf("\nRSC 
desc=%d next_desc=%d len=%d eop=%d\n", ixgmq_.rx_head_, rx_desc->next_descriptor_ptr, plen, rx_desc->eop); + + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + + rsc_count ++; + if(rx_desc->eop) { + rxflag = 0; + // TCP/UDP checksum + if (rx_desc->l4i) { + rxflag |= RXFLAG_L4CS; + if (!(rx_desc->l4e)) { + rxflag |= RXFLAG_L4CS_VALID; + } + } + // Ipv4 checksum + if (rx_desc->ipcs) { + rxflag |= RXFLAG_IPCS; + if (!(rx_desc->ipe)) { + rxflag |= RXFLAG_IPCS_VALID; + } + } + + ixgmq_.rx_desc_counts[rsc_count] ++; + //if(b->ComputeChainDataLength() > 256) { + // auto p1 = reinterpret_cast(b->MutData()); + // for (int i = 0; i < 248; i+=8) { + // ebbrt::kprintf_force("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + // } + //} + //ebbrt::kprintf("RSC len=%u rsc_count=%d\n", b->ComputeChainDataLength(), rsc_count); + break; + } + } + + root_.itf_.Receive(std::move(b), rxflag); + } - //ebbrt::kprintf("\t rx_head=%d rsccnt=%d len=%d rss_type=0x%X rss_hash=0x%X\n", *rxhead, rsccnt, tmp->pkt_len, tmp->rss_type, tmp->rss_hash); + } + } + +/*void ebbrt::IxgbeDriverRep::ReceivePoll() { + uint32_t len; + uint64_t bAddr; + uint64_t rxflag; + bool process_rsc; + uint32_t count; + uint32_t rnt; + uint32_t rxhead; + process_rsc = false; + rxflag = 0; + count = 0; + rnt = 0; + uint32_t mcore = static_cast(Cpu::GetMine()); +#ifdef STATS_EN + ixgmq_.stat_num_recv ++; +#endif + + // while there are still packets received + while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 1) { + // hit last rsc context, start to process all buffers + if (process_rsc) { + } else { + count ++; + + //ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d\n", mcore, len, rxhead); + +#ifdef STATS_EN + ixgmq_.stat_num_rx_bytes += len; +#endif - // reset descriptor - //ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; - //ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; + ixgmq_.circ_buffer_[rxhead]->SetLength(len); + auto b 
= std::move(ixgmq_.circ_buffer_[rxhead]); - // bump head ptr - ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + // bump tail ptr + ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + root_.itf_.Receive(std::move(b), rxflag); + + // reset buffer + ixgmq_.rx_ring_[rxhead].raw[0] = 0; + ixgmq_.rx_ring_[rxhead].raw[1] = 0; + // allocate new rx buffer + ixgmq_.circ_buffer_[rxhead] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[rxhead])->MutData()); + // update buffer with new adder + ixgmq_.rx_ring_[rxhead].buffer_address = rxphys; + + wmb(); + WriteRdt_1(mcore, ixgmq_.rx_tail_); + } + } + }*/ + +ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) + : root_(root), ixgq_(root_.GetQueue()), + ixgmq_(root.GetMultiQueue(Cpu::GetMine())), + receive_callback_([this]() { ReceivePoll(); }) { + //this->ReceivePoll(); + ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); + ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); + + ixgmq_.powerMeter = ebbrt::rapl::RaplCounter(); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StartTimer() { + auto timeout = std::chrono::seconds(1); + timer->Start(*this, timeout, true); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StopTimer() { + timer->Stop(*this); +} + +void ebbrt::IxgbeDriverRep::IxgbeDriverRep::Fire() { + uint32_t mcore = static_cast(Cpu::GetMine()); + + ixgmq_.perfCycles.Stop(); + ixgmq_.perfInst.Stop(); + ixgmq_.perfLLC_miss.Stop(); + if(mcore == 0 || mcore == 1) { + ixgmq_.powerMeter.Stop(); + } + // accumulate counters + ixgmq_.totalCycles += static_cast(ixgmq_.perfCycles.Read()); + ixgmq_.totalIns += static_cast(ixgmq_.perfInst.Read()); + ixgmq_.totalLLCmisses += static_cast(ixgmq_.perfLLC_miss.Read()); + if(mcore == 0 || mcore == 1) { + ixgmq_.totalNrg += ixgmq_.powerMeter.Read(); + 
//ebbrt::kprintf_force("Core %u: Fire() cycles=%llu ins=%llu llc=%llu energy=%.2lfJ\n", mcore, ixgmq_.totalCycles, ixgmq_.totalIns, ixgmq_.totalLLCmisses, ixgmq_.totalNrg); + } + + ixgmq_.perfCycles.Clear(); + ixgmq_.perfInst.Clear(); + ixgmq_.perfLLC_miss.Clear(); + + ixgmq_.perfCycles.Start(); + ixgmq_.perfInst.Start(); + ixgmq_.perfLLC_miss.Start(); + if(mcore == 0 || mcore == 1) { + ixgmq_.powerMeter.Start(); + } + ixgmq_.fireCount += 1; + //ebbrt::kprintf_force("Core %u: Fire() %llu\n", mcore, ixgmq_.fireCount); +} + +uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01010 + 0x40 * n); + return reg & 0xFFFF; +} +uint16_t ebbrt::IxgbeDriverRep::ReadRdt_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x01018 + 0x40 * n); + return reg & 0xFFFF; +} + +void ebbrt::IxgbeDriverRep::WriteRdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x01018 + 0x40 * n, m); +} + +void ebbrt::IxgbeDriverRep::Run() { + while (1) { + ReceivePoll(); + } +} +void ebbrt::IxgbeDriverRep::WriteTdt_1(uint32_t n, uint32_t m) { + root_.bar0_.Write32(0x06018 + 0x40 * n, m); +} + +// 8.2.3.5.9 Extended Interrupt Mask Clear Registers — EIMC[n] +// (0x00AB0 + 4*(n-1), n=1...2; WO) +void ebbrt::IxgbeDriverRep::WriteEimcn(uint32_t n, uint32_t m) { + auto reg = root_.bar0_.Read32(0x00AB0 + 4 * n); + root_.bar0_.Write32(0x00AB0 + 4 * n, reg | m); +} + +// 8.2.3.5.4 Extended Interrupt Mask Clear Register- EIMC (0x00888; WO) +void ebbrt::IxgbeDriverRep::WriteEimc(uint32_t m) { root_.bar0_.Write32(0x00888, m); } + +// 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) +void ebbrt::IxgbeDriverRep::WriteEims(uint32_t m) { root_.bar0_.Write32(0x00880, m); } + +uint32_t ebbrt::IxgbeDriverRep::ReadTdh_1(uint32_t n) { + auto reg = root_.bar0_.Read32(0x06010 + 0x40 * n); + return reg & 0xFFFF; +} +uint32_t ebbrt::IxgbeDriverRep::ReadTdt_1(uint32_t n) { + return root_.bar0_.Read32(0x06018 + 0x40 * n) & 0xFFFF; +} + + /* + 
ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_, rsccnt, tmp->next_descriptor_ptr, tmp->pkt_len, tmp->eop); + auto p1 = reinterpret_cast((ixgmq_.circ_buffer_[ixgmq_.rx_head_])->MutData()); + int i=0; + ebbrt::kprintf_force("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + + rdesc_adv_wb_t* tmp2; + tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+1])); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+1, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); + p1 = reinterpret_cast((ixgmq_.circ_buffer_[ixgmq_.rx_head_+1])->MutData()); + i = 0; + ebbrt::kprintf_force("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); return 1; - } + */ + /*else if (rsccnt > 0 && tmp->eop) { + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_, rsccnt, tmp->next_descriptor_ptr, tmp->pkt_len, tmp->eop); + rdesc_adv_wb_t* tmp2; + tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+1])); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+1, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); + tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+2])); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+2, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); + tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+3])); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+3, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); + + } else { + ebbrt::kabort("%s rsccnt > 0 && !(tmp->eop) \n", __FUNCTION__); + }*/ + + /* // not sure what case this is, no context started, eop is set but 
rsccnt > 0 else if (rsccnt > 0 && tmp->eop && !(ixgmq_.rsc_used)) { kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, "RSC: NEXTP > RX_SIZE\n"); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_, rsccnt, tmp->next_descriptor_ptr, tmp->pkt_len, tmp->eop); + rdesc_adv_wb_t* tmp2; + tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+1])); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+1, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); + tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+2])); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+2, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); + tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+3])); + ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+3, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); + *len = tmp->pkt_len; - /* set rx flags */ + // set rx flags // TCP/UDP checksum if (tmp->l4i) { *rxflag |= RXFLAG_L4CS; @@ -2193,6 +2696,8 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, "RSC: NEXTP > RX_SIZE\n"); + ebbrt::kprintf_force("GetRxBuf NEW RSC CONTEXT rsccnt=%d len=%d\n", rsccnt, tmp->pkt_len); + ixgmq_.rsc_used = true; ixgmq_.rsc_chain_.clear(); ixgmq_.rsc_chain_.emplace_back( @@ -2207,6 +2712,8 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, "RSC: NEXTP > RX_SIZE\n"); + ebbrt::kprintf_force("GetRxBuf Append RSC CONTEXT rsccnt=%d len=%d\n", rsccnt, tmp->pkt_len); + ixgmq_.rsc_chain_.emplace_back( std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); @@ -2217,9 +2724,11 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, } // LAST 
RSC CONTEXT else if (rsccnt > 0 && tmp->eop && ixgmq_.rsc_used) { - ixgmq_.rsc_used = false; + ixgmq_.rsc_used = false; - /* set rx flags */ + ebbrt::kprintf_force("GetRxBuf Last RSC CONTEXT rsccnt=%d len=%d\n", rsccnt, tmp->pkt_len); + + // // TCP/UDP checksum if (tmp->l4i) { *rxflag |= RXFLAG_L4CS; @@ -2248,134 +2757,9 @@ uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, } else { // shouldn't hit here ebbrt::kabort("%s Not sure what state\n", __FUNCTION__); - } - -/*#else - // no RSC so just get one packet at a time - rdesc_legacy_t tmp; - tmp = ixgmq_.rx_ring_[ixgmq_.rx_head_]; - - if (tmp.dd && tmp.eop) { - *len = tmp.length; - - // set rx flags - // TCP/UDP checksum - if (tmp.l4cs) { - *rxflag |= RXFLAG_L4CS; - if (!(tmp.tcpe)) { - *rxflag |= RXFLAG_L4CS_VALID; - } - } - - // Ipv4 checksum - if (tmp.ipcs) { - *rxflag |= RXFLAG_IPCS; - if (!(tmp.ipe)) { - *rxflag |= RXFLAG_IPCS_VALID; - } - } - - // reset descriptor - ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[0] = 0; - ixgmq_.rx_ring_[ixgmq_.rx_head_].raw[1] = 0; - - // bump head ptr - ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; -#ifdef STATS_EN - ixgmq_.stat_num_rx ++; -#endif - return 0; - } - #endif*/ - - return 0; -} - -void ebbrt::IxgbeDriverRep::ReceivePoll() { - uint32_t len; - uint64_t bAddr; - uint64_t rxflag; - bool process_rsc; - uint32_t count; - uint32_t rnt; - uint32_t rxhead; - process_rsc = false; - rxflag = 0; - count = 0; - rnt = 0; - uint32_t mcore = static_cast(Cpu::GetMine()); -#ifdef STATS_EN - ixgmq_.stat_num_recv ++; -#endif - - /*if(ixgmq_.time_send > 0) { - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - uint64_t endt = std::chrono::duration_cast(d).count(); - uint64_t idlet = endt - ixgmq_.time_send; - uint64_t idlet_mod = (idlet / 50) * 50; - - auto got = ixgmq_.idle_times_.find(idlet_mod); - // not found - if(got == ixgmq_.idle_times_.end()) - ixgmq_.idle_times_[idlet_mod] = 1; - else - ixgmq_.idle_times_[idlet_mod] ++; - - 
ixgmq_.time_idle_min = idlet < ixgmq_.time_idle_min ? idlet : ixgmq_.time_idle_min; - ixgmq_.time_idle_max = idlet > ixgmq_.time_idle_max ? idlet : ixgmq_.time_idle_max; - ixgmq_.total_idle_time += idlet; - }*/ - - // while there are still packets received - while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 1) { - // hit last rsc context, start to process all buffers - if (process_rsc) { - process_rsc = false; - count++; - - auto n = ixgmq_.rsc_chain_[0].first; - uint32_t rsclen = 0; - - // TODO hack - need to set actual length of data else there'll be 0's - // attached - ixgmq_.circ_buffer_[n]->SetLength(ixgmq_.rsc_chain_[0].second); - - rsclen += ixgmq_.rsc_chain_[0].second; - - // TODO - maybe find better way to rewrite this - auto b = std::move(ixgmq_.circ_buffer_[n]); - - for (size_t x = 1; x < ixgmq_.rsc_chain_.size(); x++) { - count++; - - auto n = ixgmq_.rsc_chain_[x].first; - // TODO hack - need to set actual length of data - ixgmq_.circ_buffer_[n]->SetLength(ixgmq_.rsc_chain_[x].second); - rsclen += ixgmq_.rsc_chain_[x].second; - b->PrependChain(std::move(ixgmq_.circ_buffer_[n])); - } - - ReclaimRx(); - - /*if (len > 60) { - ebbrt::kprintf("\t RSC on core: %d len=%u\n", mcore, rsclen); - }*/ - root_.itf_.Receive(std::move(b), rxflag); - } else { - count ++; - - //ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d\n", mcore, len, rxhead); - -#ifdef STATS_EN - ixgmq_.stat_num_rx_bytes += len; -#endif - - ixgmq_.circ_buffer_[rxhead]->SetLength(len); - auto b = std::move(ixgmq_.circ_buffer_[rxhead]); - - // bump tail ptr - ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + } */ + //ebbrt::kprintf("\t Core: %d ReceivePoll() len=%d rxhead=%d num_chains=%d *** \n\n", mcore, len, rxhead, b->CountChainElements()); /*if (len > 60) { ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d START\n", mcore, len, rxhead); @@ -2396,19 +2780,9 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { /*if (len > 60) { ebbrt::kprintf("\t 
ReceivePoll on core: %d len=%u\n", mcore, len); }*/ - root_.itf_.Receive(std::move(b), rxflag); + //ebbrt::kprintf("\t ReceivePoll() on core: %d len=%d\n", mcore, len); + - // reset buffer - ixgmq_.rx_ring_[rxhead].raw[0] = 0; - ixgmq_.rx_ring_[rxhead].raw[1] = 0; - // allocate new rx buffer - ixgmq_.circ_buffer_[rxhead] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); - auto rxphys = - reinterpret_cast((ixgmq_.circ_buffer_[rxhead])->MutData()); - // update buffer with new adder - ixgmq_.rx_ring_[rxhead].buffer_address = rxphys; - WriteRdt_1(mcore, ixgmq_.rx_tail_); - /*// done with buffer addr above, now to reuse it auto tail = ixgmq_.rx_tail_; @@ -2451,106 +2825,265 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { root_.itf_.Receive(std::move(b), rxflag); }*/ - } - } - // TODO: Update tail register here or above? -// if (count > 0) { - // update reg - // WriteRdt_1(mcore, ixgmq_.rx_tail_); - //} -} + /*if(ixgmq_.time_send > 0) { + auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + uint64_t endt = std::chrono::duration_cast(d).count(); + uint64_t idlet = endt - ixgmq_.time_send; + uint64_t idlet_mod = (idlet / 50) * 50; -ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) - : root_(root), ixgq_(root_.GetQueue()), - ixgmq_(root.GetMultiQueue(Cpu::GetMine())), - receive_callback_([this]() { ReceivePoll(); }) { - //this->ReceivePoll(); - ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); - ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); - ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); - - ixgmq_.powerMeter = ebbrt::rapl::RaplCounter(); -} + auto got = ixgmq_.idle_times_.find(idlet_mod); + // not found + if(got == ixgmq_.idle_times_.end()) + ixgmq_.idle_times_[idlet_mod] = 1; + else + ixgmq_.idle_times_[idlet_mod] ++; -void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StartTimer() { - auto timeout = std::chrono::seconds(1); - timer->Start(*this, timeout, 
true); -} + ixgmq_.time_idle_min = idlet < ixgmq_.time_idle_min ? idlet : ixgmq_.time_idle_min; + ixgmq_.time_idle_max = idlet > ixgmq_.time_idle_max ? idlet : ixgmq_.time_idle_max; + ixgmq_.total_idle_time += idlet; + }*/ -void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StopTimer() { - timer->Stop(*this); -} +// keep check for new packets to receive +// may wait for RSC to be done +// uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, +// uint64_t* rxflag, bool* process_rsc, +// uint32_t* rnt, uint32_t* rxhead) { +// rdesc_adv_wb_t* tmp; +// tmp = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); +// uint32_t i; +// //uint32_t mcore = static_cast(Cpu::GetMine()); -void ebbrt::IxgbeDriverRep::IxgbeDriverRep::Fire() { - uint32_t mcore = static_cast(Cpu::GetMine()); +// // if no rx packets ready +// if (!(tmp->dd)) { +// return 0; +// } - ixgmq_.perfCycles.Stop(); - ixgmq_.perfInst.Stop(); - ixgmq_.perfLLC_miss.Stop(); - if(mcore == 0 || mcore == 1) { - ixgmq_.powerMeter.Stop(); - } - // accumulate counters - ixgmq_.totalCycles += static_cast(ixgmq_.perfCycles.Read()); - ixgmq_.totalIns += static_cast(ixgmq_.perfInst.Read()); - ixgmq_.totalLLCmisses += static_cast(ixgmq_.perfLLC_miss.Read()); - if(mcore == 0 || mcore == 1) { - ixgmq_.totalNrg += ixgmq_.powerMeter.Read(); - //ebbrt::kprintf_force("Core %u: Fire() cycles=%llu ins=%llu llc=%llu energy=%.2lfJ\n", mcore, ixgmq_.totalCycles, ixgmq_.totalIns, ixgmq_.totalLLCmisses, ixgmq_.totalNrg); - } - - ixgmq_.perfCycles.Clear(); - ixgmq_.perfInst.Clear(); - ixgmq_.perfLLC_miss.Clear(); +// //auto rsccnt = tmp->rsccnt; + +// // makes sure all reads are finished before +// asm volatile("lfence":::"memory"); + +// //ebbrt::kprintf("rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", *rxhead, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); - ixgmq_.perfCycles.Start(); - ixgmq_.perfInst.Start(); - ixgmq_.perfLLC_miss.Start(); - if(mcore == 0 || mcore == 1) { - 
ixgmq_.powerMeter.Start(); - } -} +// // not RSC, handled normally +// // if (rsccnt == 0 && tmp->eop && tmp->dd) { +// if (tmp->eop && tmp->dd) { +// *len = tmp->pkt_len; + +// /* set rx flags */ +// // TCP/UDP checksum +// if (tmp->l4i) { +// *rxflag |= RXFLAG_L4CS; +// if (!(tmp->l4e)) { +// *rxflag |= RXFLAG_L4CS_VALID; +// } +// } + +// // Ipv4 checksum +// if (tmp->ipcs) { +// *rxflag |= RXFLAG_IPCS; +// if (!(tmp->ipe)) { +// *rxflag |= RXFLAG_IPCS_VALID; +// } +// } + +// *rxhead = ixgmq_.rx_head_; +// //ebbrt::kprintf("** GetRxBuf START **\n \t rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", ixgmq_.rx_head_, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); +// //ebbrt::kprintf("\t rx_head=%d rsccnt=%d len=%d rss_type=0x%X rss_hash=0x%X\n", *rxhead, rsccnt, tmp->pkt_len, tmp->rss_type, tmp->rss_hash); + +// // bump head ptr +// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; + +// return 1; +// } else if(!(tmp->eop) && tmp->dd) { +// uint32_t desc_count = 0; +// uint32_t start_header = ixgmq_.rx_head_; + +// /*** RSC FIRST PACKET ***/ +// ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(tmp->pkt_len); +// auto b = std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_]); + +// // bump head ptr +// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; +// // bump tail ptr +// ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; +// desc_count ++; + +// rdesc_adv_wb_t* tmp_next; +// uint32_t next_desc = tmp->next_descriptor_ptr; +// //ebbrt::kprintf("\nRSC start_desc=%d next_desc=%d len=%d\n", start_header, next_desc, tmp->pkt_len); + +// // hopefully won't happen @@ +// if(next_desc != ixgmq_.rx_head_) { +// ebbrt::kabort("1) next_desc=%d != ixgmq_.rx_head_=%d\n", next_desc, ixgmq_.rx_head_); +// } +// tmp_next = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); + +// /*** RSC MIDDILE CHAINS ***/ +// while(!(tmp_next->eop)) { +// desc_count ++; +// ixgmq_.circ_buffer_[next_desc]->SetLength(tmp_next->pkt_len); +// 
b->PrependChain(std::move(ixgmq_.circ_buffer_[next_desc])); + +// // bump head ptr +// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; +// // bump tail ptr +// ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + +// //ebbrt::kprintf("curr_desc=%d, next_desc=%d len=%d\n", next_desc, tmp_next->next_descriptor_ptr, tmp_next->pkt_len); + +// next_desc = tmp_next->next_descriptor_ptr; +// if(next_desc != ixgmq_.rx_head_) { +// ebbrt::kprintf("2) next_desc=%d != ixgmq_.rx_head_=%d\n", next_desc, ixgmq_.rx_head_); +// } +// tmp_next = reinterpret_cast(&(ixgmq_.rx_ring_[next_desc])); +// } + +// /*** RSC LAST PACKET ***/ +// desc_count ++; +// //ebbrt::kprintf("\t GetRxBuf() rx_head_=%d rsccnt=%d len=%d dd=%u eop=%d\n", rh, tmp2->rsccnt, tmp2->pkt_len, tmp2->dd, tmp2->eop); +// if (tmp_next->l4i) { +// *rxflag |= RXFLAG_L4CS; +// if (!(tmp_next->l4e)) { +// *rxflag |= RXFLAG_L4CS_VALID; +// } +// } +// // Ipv4 checksum +// if (tmp_next->ipcs) { +// *rxflag |= RXFLAG_IPCS; +// if (!(tmp_next->ipe)) { +// *rxflag |= RXFLAG_IPCS_VALID; +// } +// } + +// ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(tmp_next->pkt_len); +// b->PrependChain(std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_])); + +// //ebbrt::kprintf("LAST RSC desc = %d len=%d tail=%d\n\n", ixgmq_.rx_head_, tmp_next->pkt_len, ixgmq_.rx_tail_); + +// // bump head ptr +// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; +// // bump tail ptr +// ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + +// // Process Packet +// root_.itf_.Receive(std::move(b), *rxflag); + +// // reset descriptors +// for (i=0;i((ixgmq_.circ_buffer_[start_header+i])->MutData()); +// // update descriptor with new buffer adder +// ixgmq_.rx_ring_[start_header+i].buffer_address = rxphys; +// } + +// // tell NIC which descriptors are free +// asm volatile("lfence" ::: "memory"); +// asm volatile("sfence" ::: "memory"); +// //WriteRdt_1(mcore, ixgmq_.rx_tail_); + +// // Clean up descriptors +// *process_rsc 
= true; +// return 1; +// } else { +// ebbrt::kabort("\t GetRxBuf(): Unknown RX packet descriptor\n"); +// return 0; +// } + +// /*ixgmq_.rsc_chain_.clear(); + +// ixgmq_.rsc_chain_.emplace_back( +// std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); +// //ebbrt::kprintf("*** GetRxBuf START\n \t rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", ixgmq_.rx_head_, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); + +// rdesc_adv_wb_t* tmp2; +// uint32_t rh = tmp->next_descriptor_ptr; +// tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[rh])); +// while(!(tmp2->eop)) { +// //ebbrt::kprintf_force("\t GetRxBuf() rx_head_=%d rsccnt=%d len=%d dd=%u eop=%d nextp=%d \n", rh, tmp2->rsccnt, tmp2->pkt_len, tmp2->dd, tmp2->eop, tmp2->next_descriptor_ptr); +// ixgmq_.rsc_chain_.emplace_back( +// std::make_pair(rh, static_cast(tmp2->pkt_len))); +// rh = tmp2->next_descriptor_ptr; +// tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[rh])); +// } + -uint16_t ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { - auto reg = root_.bar0_.Read32(0x01010 + 0x40 * n); - return reg & 0xFFFF; -} -uint16_t ebbrt::IxgbeDriverRep::ReadRdt_1(uint32_t n) { - auto reg = root_.bar0_.Read32(0x01018 + 0x40 * n); - return reg & 0xFFFF; -} + +// ixgmq_.rsc_chain_.emplace_back( +// std::make_pair(rh, static_cast(tmp2->pkt_len))); -void ebbrt::IxgbeDriverRep::WriteRdt_1(uint32_t n, uint32_t m) { - root_.bar0_.Write32(0x01018 + 0x40 * n, m); -} +// // bump head ptr +// ixgmq_.rx_head_ = (rh + 1) % ixgmq_.rx_size_; +// *process_rsc = true; +// return 1;*/ + +// return 0; +// } -void ebbrt::IxgbeDriverRep::Run() { - while (1) { - ReceivePoll(); - } -} -void ebbrt::IxgbeDriverRep::WriteTdt_1(uint32_t n, uint32_t m) { - root_.bar0_.Write32(0x06018 + 0x40 * n, m); -} +/*void ebbrt::IxgbeDriverRep::ReceivePoll() { + uint32_t len; + uint64_t bAddr; + uint64_t rxflag; + bool process_rsc; + uint32_t count; + uint32_t rnt; + uint32_t rxhead; + process_rsc = false; + rxflag = 0; + count = 0; + rnt = 0; 
+ //uint32_t mcore = static_cast(Cpu::GetMine()); +#ifdef STATS_EN + ixgmq_.stat_num_recv ++; +#endif + + // while there are still packets received + while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 1) { + // hit last rsc context, start to process all buffers + if (process_rsc) { + } + else { + count ++; -// 8.2.3.5.9 Extended Interrupt Mask Clear Registers — EIMC[n] -// (0x00AB0 + 4*(n-1), n=1...2; WO) -void ebbrt::IxgbeDriverRep::WriteEimcn(uint32_t n, uint32_t m) { - auto reg = root_.bar0_.Read32(0x00AB0 + 4 * n); - root_.bar0_.Write32(0x00AB0 + 4 * n, reg | m); -} +#ifdef STATS_EN + ixgmq_.stat_num_rx_bytes += len; +#endif + + ixgmq_.circ_buffer_[rxhead]->SetLength(len); + auto b = std::move(ixgmq_.circ_buffer_[rxhead]); -// 8.2.3.5.4 Extended Interrupt Mask Clear Register- EIMC (0x00888; WO) -void ebbrt::IxgbeDriverRep::WriteEimc(uint32_t m) { root_.bar0_.Write32(0x00888, m); } + // bump tail ptr + ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; + //ebbrt::kprintf("\t ReceivePoll() on core: %d len=%d\n", mcore, len); + + root_.itf_.Receive(std::move(b), rxflag); + + // reset buffer + ixgmq_.rx_ring_[rxhead].raw[0] = 0; + ixgmq_.rx_ring_[rxhead].raw[1] = 0; + // allocate new rx buffer + ixgmq_.circ_buffer_[rxhead] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); + auto rxphys = + reinterpret_cast((ixgmq_.circ_buffer_[rxhead])->MutData()); + // update buffer with new adder + ixgmq_.rx_ring_[rxhead].buffer_address = rxphys; -// 8.2.3.5.3 Extended Interrupt Mask Set/Read Register- EIMS (0x00880; RWS) -void ebbrt::IxgbeDriverRep::WriteEims(uint32_t m) { root_.bar0_.Write32(0x00880, m); } + asm volatile("lfence" ::: "memory"); + asm volatile("sfence" ::: "memory"); + //WriteRdt_1(mcore, ixgmq_.rx_tail_); + + } + } -uint32_t ebbrt::IxgbeDriverRep::ReadTdh_1(uint32_t n) { - auto reg = root_.bar0_.Read32(0x06010 + 0x40 * n); - return reg & 0xFFFF; -} -uint32_t ebbrt::IxgbeDriverRep::ReadTdt_1(uint32_t n) { - return 
root_.bar0_.Read32(0x06018 + 0x40 * n) & 0xFFFF; -} + // TODO: Update tail register here or above? +// if (count > 0) { + // update reg + // WriteRdt_1(mcore, ixgmq_.rx_tail_); + //} + }*/ diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index ee4c8506..9ee9f1f5 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -22,11 +22,12 @@ #include "Rapl.h" // Receive Side Scaling (RSC) enabled -//#define RSC_EN +#define RSC_EN // Direct Cache Access (DCA) enabled //#define DCA_ENABLE // Transmit Header Writeback enabled //#define TX_HEAD_WB +//#define JUMBO_EN // Collect Statistics Flag #define STATS_EN @@ -38,15 +39,15 @@ namespace ebbrt { // Per-core receive and transmit queue typedef struct { rdesc_legacy_t* rx_ring; - size_t rx_head; - size_t rx_tail; - size_t rx_size; + uint32_t rx_head; + uint32_t rx_tail; + uint32_t rx_size; tdesc_legacy_t* tx_ring; uint32_t* tx_head; - size_t tx_tail; - size_t tx_last_tail; - size_t tx_size; + uint32_t tx_tail; + uint32_t tx_last_tail; + uint32_t tx_size; bool* tx_isctx; // buffers holding packet data @@ -82,6 +83,10 @@ class IxgbeDriver : public EthernetDevice { //void Run(); void Send(std::unique_ptr buf, PacketInfo pinfo) override; + //void SendUdp(std::unique_ptr buf, uint64_t len) override; + //void SendTCPUnchained(std::unique_ptr buf, uint64_t len) override; + //void SendTCPUnchained(std::unique_ptr buf, uint64_t len) override; + void Config(std::string s, uint32_t v) override; std::string ReadNic() override; const EthernetAddress& GetMacAddress() override; @@ -111,15 +116,19 @@ class IxgbeDriver : public EthernetDevice { #endif // Linux Defaults - static const constexpr uint32_t RXBUFSZ = 2048; + //static const constexpr uint32_t RXBUFSZ = 2048; + //static const constexpr uint32_t RXBUFSZ = 8192; static const constexpr uint32_t BSIZEHEADER = 256; - - //static const constexpr uint32_t RXBUFSZ = 4096; + + static const constexpr uint32_t RXBUFSZ = 4092; + //static const constexpr uint32_t 
RXBUFSZ = 8192; //static const constexpr uint32_t RXBUFSZ = 16384; - static const constexpr uint8_t ITR_INTERVAL = 8; - // 3 bits only (0 - 7) in (RSC_DELAY + 1) * 4 us - static const constexpr uint8_t RSC_DELAY = 1; + // 8 bits (3 - 11) in (ITR_INTERVAL * 2 us) + static const constexpr uint8_t ITR_INTERVAL = 32; + + // 3 bits only (0 - 7) in (RSC_DELAY + 1) * 4 us + static const constexpr uint8_t RSC_DELAY = 7; // DMA Tx TCP Max Allow Size Requests — DTXMXSZRQ static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0x10; @@ -128,7 +137,7 @@ class IxgbeDriver : public EthernetDevice { // Class with per core queue data structures class e10Kq { public: - e10Kq(size_t idx, Nid nid) + e10Kq(uint32_t idx, Nid nid) : rx_head_(0), rx_tail_(0), rx_size_(NRXDESCS), tx_tail_(0), tx_last_tail_(0), tx_size_(NTXDESCS), idx_(idx), rxflag_(0), rsc_used(false), hanc{0} { @@ -214,22 +223,30 @@ class IxgbeDriver : public EthernetDevice { "rx_size_bytes_ not 128 byte aligned\n"); ebbrt::kbugon((tx_size_bytes_ & 0x7F) != 0, "tx_size_bytes_ not 128 byte aligned\n"); + + tx_desc_counts.reserve(100); + rx_desc_counts.reserve(100); + for(int i=0;i<100;i++) { + tx_desc_counts.emplace_back(0); + rx_desc_counts.emplace_back(0); + } } - size_t rx_head_; - size_t rx_tail_; - size_t rx_size_; - size_t tx_tail_; - size_t tx_last_tail_; - size_t tx_size_; - size_t idx_; - size_t rx_size_bytes_; - size_t tx_size_bytes_; + uint32_t rx_head_; + uint32_t rx_tail_; + uint32_t rx_size_; + uint32_t tx_tail_; + uint32_t tx_last_tail_; + uint32_t tx_size_; + uint32_t idx_; + uint32_t rx_size_bytes_; + uint32_t tx_size_bytes_; uint64_t rxaddr_; uint64_t txaddr_; uint64_t txhwbaddr_; uint64_t rxflag_; - + uint64_t cleaned_count{0}; + std::vector> circ_buffer_; std::vector> rsc_chain_; std::unordered_map idle_times_; @@ -248,7 +265,7 @@ class IxgbeDriver : public EthernetDevice { #ifdef TX_HEAD_WB uint32_t* tx_head_; #else - size_t tx_head_; + uint32_t tx_head_; #endif // stats @@ -265,8 +282,11 @@ class 
IxgbeDriver : public EthernetDevice { uint64_t totalCycles{0}; uint64_t totalIns{0}; uint64_t totalLLCmisses{0}; + uint64_t fireCount{0}; uint32_t rapl_val{666}; uint32_t itr_val{8}; + std::vector tx_desc_counts; + std::vector rx_desc_counts; double totalNrg{0.0}; double totalTime{0.0}; double totalPower{0.0}; @@ -463,7 +483,8 @@ class IxgbeDriver : public EthernetDevice { void WriteMngtxmap(uint32_t m); void WriteRxfeccerr0(uint32_t m); - + void WriteMaxfrs(uint32_t m); + uint8_t ReadRdrxctlDmaidone(); void ReadEicr(); @@ -505,7 +526,7 @@ class IxgbeDriver : public EthernetDevice { void DumpStats(); e10k_queue_t& GetQueue() const { return *ixgq; } - e10Kq& GetMultiQueue(size_t index) const { return *ixgmq[index]; } + e10Kq& GetMultiQueue(uint32_t index) const { return *ixgmq[index]; } pci::Device& dev_; pci::Bar& bar0_; @@ -532,6 +553,10 @@ class IxgbeDriverRep : public MulticoreEbb, Timer:: void ReclaimTx(); void ReclaimRx(); void Send(std::unique_ptr buf, PacketInfo pinfo); + void SendUdp(std::unique_ptr buf, uint64_t len, PacketInfo pinfo); + void SendTCPChained(std::unique_ptr buf, uint64_t len, uint64_t num_chains, PacketInfo pinfo); + void SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo); + //void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, // enum l4_type l4type); //void AddTx(uint64_t pa, uint64_t len, uint64_t totallen, bool first, diff --git a/src/native/Net.cc b/src/native/Net.cc index cb0afda9..d0617c0c 100644 --- a/src/native/Net.cc +++ b/src/native/Net.cc @@ -36,17 +36,12 @@ void ebbrt::NetworkManager::Interface::Receive(std::unique_ptr buf, break; } default: { - ebbrt::kprintf("NetworkManager::Interface::Receive(): Unknown eth_header.type=0x%X packet_len=%u\n", ntohs(eth_header.type), packet_len); - ebbrt::kabort("NetworkManager::Interface::Receive()\n"); - /*for (int i = 0; i < (int)packet_len; i+=8) { - if (i+8 < (int)packet_len) { - ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], 
p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - } else { - for(int j = i; j < (int)packet_len; j++) { - ebbrt::kprintf("%02X\n", p1[j]); - } - } - }*/ + //ebbrt::kprintf("NetworkManager::Interface::Receive(): Unknown eth_header.type=0x%X packet_len=%u\n", ntohs(eth_header.type), packet_len); + /*auto p1 = reinterpret_cast(buf->MutData()); + for (int i = 0; i < 256; i+=8) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } + ebbrt::kabort("NetworkManager::Interface::Receive()\n");*/ } } } diff --git a/src/native/NetTcp.cc b/src/native/NetTcp.cc index af6a64e8..358c3bc4 100644 --- a/src/native/NetTcp.cc +++ b/src/native/NetTcp.cc @@ -1148,7 +1148,8 @@ void ebbrt::NetworkManager::TcpEntry::SendSegment(TcpSegment& segment) { pinfo.tcp_len = len - pinfo.tcp_hdr_len; // XXX: Actually store the MSS instead of making this assumption - size_t mss = 1460; + //size_t mss = 1460; + size_t mss = 2048; if (segment.tcp_len > mss) { pinfo.gso_type = PacketInfo::kGsoTcpv4; pinfo.hdr_len = segment.th.HdrLen(); diff --git a/src/native/Newlib.cc b/src/native/Newlib.cc index 7110c825..f800e66f 100644 --- a/src/native/Newlib.cc +++ b/src/native/Newlib.cc @@ -88,7 +88,7 @@ extern "C" int ebbrt_newlib_fstat(int file, struct stat* st) { return 0; } -extern "C" int ebbrt_newlib_stat(const char* file, struct stat* st) { +extern "C" int ebbrt_newlib_stat(const char* file, struct stat* st) { EBBRT_UNIMPLEMENTED(); return 0; } @@ -158,6 +158,181 @@ extern "C" int ebbrt_newlib_gettimeofday(struct timeval* p, void* z) { return 0; } +extern "C" int ebbrt_newlib_fcntl(int s, int cmd) { + EBBRT_UNIMPLEMENTED(); + return 0; +} + +extern "C" char* ebbrt_newlib_getcwd(char *buf, size_t size) { + EBBRT_UNIMPLEMENTED(); + return 0; +} + +extern "C" int ebbrt_newlib_dup(int oldfd) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_clock_gettime () +{ + EBBRT_UNIMPLEMENTED(); + return 
-1; +} + +extern "C" int ebbrt_newlib_clock_settime() +{ + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_clock_getres() +{ + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_closedir(DIR *d) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_opendir(const char* c) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" void ebbrt_newlib_getppid() { + EBBRT_UNIMPLEMENTED(); +} + +extern "C" struct dirent * ebbrt_newlib_readdir(DIR *d) +{ + EBBRT_UNIMPLEMENTED(); + return NULL; +} + +extern "C" int ebbrt_newlib_pipe (int *fd) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_sched_yield() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" void ebbrt_newlib_umask () { + EBBRT_UNIMPLEMENTED(); +} + +extern "C" int ebbrt_newlib_symlink(const char *path1, const char *path2) +{ + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_rmdir(const char *path) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_mkdir(const char *path) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_chdir(const char *path) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" char* ebbrt_newlib_ttyname(int) { + EBBRT_UNIMPLEMENTED(); + return NULL; +} + +extern "C" int ebbrt_newlib_fdatasync(int) { + EBBRT_UNIMPLEMENTED(); + return 0; +} + +extern "C" int ebbrt_newlib_getuid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getgid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_geteuid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getegid() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_fsync(int) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_execv (const char *path, char *const argv[]) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_chmod() { + EBBRT_UNIMPLEMENTED(); 
+ return -1; +} + +extern "C" int ebbrt_newlib_access (const char *fn, int flags) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_utime (const char *path, char *times) { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_lstat () { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" void ebbrt_newlib_getpwnam () { + EBBRT_UNIMPLEMENTED(); +} +extern "C" void ebbrt_newlib_getpwuid () { + EBBRT_UNIMPLEMENTED(); +} + +extern "C" int ebbrt_newlib_select () { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getrusage() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_getrlimit() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + +extern "C" int ebbrt_newlib_setrlimit() { + EBBRT_UNIMPLEMENTED(); + return -1; +} + struct RLock { static const constexpr uint32_t kNoOwner = -1; uint32_t owner; diff --git a/src/native/Newlib.h b/src/native/Newlib.h index 502d4c61..3149d96f 100644 --- a/src/native/Newlib.h +++ b/src/native/Newlib.h @@ -14,7 +14,8 @@ extern "C" { typedef void* _LOCK_T; typedef void* _LOCK_RECURSIVE_T; - +typedef struct __dirstream DIR; + extern void ebbrt_newlib_lock_init(_LOCK_T*); extern void ebbrt_newlib_lock_init_recursive(_LOCK_RECURSIVE_T*); extern void ebbrt_newlib_lock_close(_LOCK_T*); @@ -48,7 +49,42 @@ extern void* ebbrt_newlib_realloc(void*, size_t); extern void* ebbrt_newlib_calloc(size_t, size_t); extern void* ebbrt_newlib_memalign(size_t, size_t); extern int ebbrt_newlib_gettimeofday(struct timeval *, void *); - +extern int ebbrt_newlib_fcntl(int , int); +extern char* ebbrt_newlib_getcwd(char * , size_t); +extern int ebbrt_newlib_dup(int); +extern int ebbrt_newlib_clock_gettime(); +extern int ebbrt_newlib_clock_settime(); +extern int ebbrt_newlib_clock_getres(); +extern int ebbrt_newlib_closedir (DIR *); +extern int ebbrt_newlib_opendir (const char*); +extern void ebbrt_newlib_getppid(); +extern struct dirent * ebbrt_newlib_readdir(DIR *d); +extern int 
ebbrt_newlib_pipe (int *); +extern int ebbrt_newlib_sched_yield(); +extern void ebbrt_newlib_umask (); +extern int ebbrt_newlib_symlink(const char *path1, const char *path2); +extern int ebbrt_newlib_rmdir(const char *path); +extern int ebbrt_newlib_mkdir(const char *path); +extern int ebbrt_newlib_chdir(const char *path); +extern char* ebbrt_newlib_ttyname(int); +extern int ebbrt_newlib_fdatasync(int); +extern int ebbrt_newlib_getuid(); +extern int ebbrt_newlib_getgid(); +extern int ebbrt_newlib_geteuid(); +extern int ebbrt_newlib_getegid(); +extern int ebbrt_newlib_fsync(int); +extern int ebbrt_newlib_execv (const char *path, char *const argv[]); +extern int ebbrt_newlib_chmod(); +extern int ebbrt_newlib_access (const char *fn, int flags); +extern int ebbrt_newlib_utime (const char *path, char *times); +extern int lstat (); +extern void ebbrt_newlib_getpwnam (); +extern void ebbrt_newlib_getpwuid (); +extern int ebbrt_newlib_select (); +extern int ebbrt_newlib_getrusage(); +extern int ebbrt_newlib_getrlimit(); +extern int ebbrt_newlib_setrlimit(); + #ifdef __cplusplus } #endif diff --git a/src/native/Perf.cc b/src/native/Perf.cc index c07edd6d..b62ad115 100644 --- a/src/native/Perf.cc +++ b/src/native/Perf.cc @@ -150,14 +150,14 @@ ebbrt::perf::PerfCounter::PerfCounter(ebbrt::perf::PerfEvent evt) : evt_{evt} { if (((pmcs >> i) & 0x1) == 0) { pmc_num_ = i; pmcs |= (0x1u << i); - kprintf("DEBUG#%d %x \n", pmc_num_, pmcs); + //kprintf("DEBUG#%d %x \n", pmc_num_, pmcs); perfevtsel.usermode = 1; perfevtsel.osmode = 1; perfevtsel.en = 1; ebbrt::msr::Write(kIa32PerfEvtSelMsr(pmc_num_), perfevtsel.val); counter_offset_ = ebbrt::msr::Read(kIa32Pmc(pmc_num_)); - kprintf("Perf counter #%d initialized to evt=%u\n", pmc_num_, - static_cast(evt_)); + //kprintf("Perf counter #%d initialized to evt=%u\n", pmc_num_, + // static_cast(evt_)); return; } } diff --git a/toolchain/patches/newlib-2.0.0.patch b/toolchain/patches/newlib-2.0.0.patch index 51c9e675..71c3194b 100644 --- 
a/toolchain/patches/newlib-2.0.0.patch +++ b/toolchain/patches/newlib-2.0.0.patch @@ -116,7 +116,7 @@ new file mode 100644 index 0000000..68b7fbe --- /dev/null +++ b/newlib/libc/sys/ebbrt/syscalls.c -@@ -0,0 +1,145 @@ +@@ -0,0 +1,304 @@ +#include +#include +#include @@ -262,3 +262,162 @@ index 0000000..68b7fbe + return ebbrt_newlib_gettimeofday(p, z); +} + ++int fcntl(int s, int cmd, ...) ++{ ++ return ebbrt_newlib_fcntl(s, cmd); ++} ++ ++char* getcwd(char *buf, size_t size) ++{ ++ return ebbrt_newlib_getcwd(buf, size); ++} ++ ++int dup(int oldfd) ++{ ++ return ebbrt_newlib_dup(oldfd); ++} ++ ++int clock_gettime (clockid_t clock_id, struct timespec *tp) ++{ ++ return ebbrt_newlib_clock_gettime(); ++} ++int clock_settime (clockid_t clock_id, const struct timespec *tp) ++{ ++ return ebbrt_newlib_clock_settime(); ++} ++int clock_getres (clockid_t clock_id, struct timespec *res) ++{ ++ return ebbrt_newlib_clock_getres(); ++} ++ ++int closedir (DIR *d) ++{ ++ return ebbrt_newlib_closedir(d); ++} ++ ++DIR *opendir(const char * c) ++{ ++ return ebbrt_newlib_opendir(c); ++} ++ ++pid_t getppid() ++{ ++ ebbrt_newlib_getppid(); ++ return -1; ++} ++ ++struct dirent * readdir(DIR *d) ++{ ++ return ebbrt_newlib_readdir(d); ++} ++ ++int pipe (int *fd) ++{ ++ return ebbrt_newlib_pipe(fd); ++} ++ ++int sched_yield () ++{ ++ return ebbrt_newlib_sched_yield(); ++} ++ ++mode_t umask (mode_t mask) ++{ ++ ebbrt_newlib_umask(); ++ return -1; ++} ++ ++int symlink(const char *path1, const char *path2) ++{ ++ return ebbrt_newlib_symlink(path1, path2); ++} ++ ++int rmdir(const char *path) ++{ ++ return ebbrt_newlib_rmdir(path); ++} ++int mkdir (const char *path, mode_t mode) ++{ ++ return ebbrt_newlib_mkdir(path); ++} ++ ++int chdir (const char *path) ++{ ++ return ebbrt_newlib_chdir(path); ++} ++ ++char* ttyname(int fd) ++{ ++ return ebbrt_newlib_ttyname(fd); ++} ++int fdatasync(int fd) ++{ ++ return ebbrt_newlib_fdatasync(fd); ++} ++uid_t getuid() { ++ return ebbrt_newlib_getuid(); ++} ++ 
++uid_t getgid(void) { ++ return ebbrt_newlib_getgid(); ++} ++ ++uid_t geteuid() { ++ return ebbrt_newlib_geteuid(); ++} ++ ++uid_t getegid(void) { ++ return ebbrt_newlib_getegid(); ++} ++ ++int fsync (int fd) ++{ ++ return ebbrt_newlib_fsync(fd); ++} ++int execv (const char *path, char *const argv[]) ++{ ++ return ebbrt_newlib_execv (path, argv); ++} ++int chmod (const char *path, mode_t mode) ++{ ++ return ebbrt_newlib_chmod(); ++} ++int access (const char *fn, int flags) ++{ ++ return ebbrt_newlib_access(fn, flags); ++} ++int utime (const char *path, char *times) ++{ ++ return ebbrt_newlib_utime(path, times); ++} ++int lstat (const char *__restrict pathname, struct stat *__restrict pstat) ++{ ++ return ebbrt_newlib_lstat(); ++} ++struct passwd* getpwnam (const char *name) ++{ ++ ebbrt_newlib_getpwnam(); ++ return NULL; ++} ++struct passwd* getpwuid (uid_t uid) ++{ ++ ebbrt_newlib_getpwuid(); ++ return NULL; ++} ++int select (int n, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, struct timeval *timeout) ++{ ++ return ebbrt_newlib_select(); ++} ++int getrusage(int who, struct rusage *rusage) ++{ ++ return ebbrt_newlib_getrusage(); ++} ++ ++int getrlimit(int resource, struct rlimit* rlim) ++{ ++ return ebbrt_newlib_getrlimit(); ++} ++int setrlimit(int resource, const struct rlimit* rlim) ++{ ++ return ebbrt_newlib_setrlimit(); ++} From 4dd77060f7c0be890ae3cfe1b5e1ce3fdc8db13d Mon Sep 17 00:00:00 2001 From: Han Date: Wed, 8 Apr 2020 18:59:42 -0400 Subject: [PATCH 16/20] remove some counters --- src/native/IxgbeDriver.cc | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index ff81f87b..4c245119 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -669,7 +669,7 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { // hardware limits sending over 40 descriptors per packet, have to manually coalesce here // hopefully not too 
often if(num_chains > 38) { - ixgmq_.tx_desc_counts[39] ++; + //ixgmq_.tx_desc_counts[39] ++; //ebbrt::kprintf_force("*** num_chains=%d > 38\n", num_chains); b = MakeUniqueIOBuf(len); auto mdata = b->MutData(); @@ -681,10 +681,10 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { SendTCPUnchained(std::move(b), len, pinfo); } else if(buf->IsChained() && num_chains <= 38) { - ixgmq_.tx_desc_counts[num_chains] ++; + //ixgmq_.tx_desc_counts[num_chains] ++; SendTCPChained(std::move(buf), len, num_chains, pinfo); } else { //Not Chained - ixgmq_.tx_desc_counts[1] ++; + //ixgmq_.tx_desc_counts[1] ++; SendTCPUnchained(std::move(buf), len, pinfo); } } @@ -2120,7 +2120,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { received MSS *****/ - WriteRscctl(i, 0x1 | (0x11 << 2)); // RSCEN=1, MAXDESC= (0x1) * SRRCTL.BSIZEPACKET < 64KB + WriteRscctl(i, 0x1 | (0x10 << 2)); // RSCEN=1, MAXDESC= (0x1) * SRRCTL.BSIZEPACKET < 64KB WritePsrtype(i, 0x1 << 4); // 4.6.7.2.2 - PSR_type4 in PSRTYPE[n] should be set #endif @@ -2279,7 +2279,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // Packet receive interrupt handler void ebbrt::IxgbeDriverRep::ReceivePoll() { - uint32_t plen, i; //, i, ntc; + uint32_t plen, i; uint64_t rxflag; rdesc_adv_wb_t* rx_desc; uint32_t mcore = static_cast(Cpu::GetMine()); @@ -2339,7 +2339,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { // handle a single receive if(rx_desc->eop) { - ixgmq_.rx_desc_counts[1] ++; + //ixgmq_.rx_desc_counts[1] ++; //if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) plen = rx_desc->pkt_len; @@ -2379,7 +2379,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { } else { - uint32_t rsc_count = 0; + //uint32_t rsc_count = 0; //RSC FIRST PACKET plen = rx_desc->pkt_len; @@ -2396,7 +2396,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.cleaned_count ++; i ++; ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - rsc_count ++; + //rsc_count ++; while(true) { rx_desc = 
reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); @@ -2434,7 +2434,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - rsc_count ++; + //rsc_count ++; if(rx_desc->eop) { rxflag = 0; // TCP/UDP checksum @@ -2452,7 +2452,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { } } - ixgmq_.rx_desc_counts[rsc_count] ++; + //ixgmq_.rx_desc_counts[rsc_count] ++; //if(b->ComputeChainDataLength() > 256) { // auto p1 = reinterpret_cast(b->MutData()); // for (int i = 0; i < 248; i+=8) { @@ -2468,7 +2468,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { } } - } +} /*void ebbrt::IxgbeDriverRep::ReceivePoll() { uint32_t len; From af1c4a39fb9b062cedb950335b2c72b4acda86e5 Mon Sep 17 00:00:00 2001 From: Han Date: Mon, 13 Apr 2020 12:29:53 -0400 Subject: [PATCH 17/20] tse fixes --- src/native/IxgbeDriver.cc | 131 +++++++++++++++++++++++++------------- src/native/IxgbeDriver.h | 15 +++-- 2 files changed, 96 insertions(+), 50 deletions(-) diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 4c245119..ed38770d 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -87,34 +87,46 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ixgmq[i]->rx_desc_counts[j] = 0; ixgmq[i]->tx_desc_counts[j] = 0; } + + ixgmq[i]->time_us = 0; + ixgmq[i]->totalNrg = 0; + if(i == 0 || i == 1) { + auto d = ebbrt::clock::Wall::Now().time_since_epoch(); + ixgmq[i]->time_us = std::chrono::duration_cast(d).count(); + ixgmq[i]->powerMeter.Start(); + } + + //ebb_->StartTimer(); //ebbrt::kprintf_force("%d: %d %d\n", i, ixgmq[i]->tx_desc_counts.size(), ixgmq[i]->rx_desc_counts.size()); /*ixgmq[i]->stat_num_recv = 0; - ixgmq[i]->time_us = 0; + ixgmq[i]->totalCycles = 0; ixgmq[i]->totalIns = 0; ixgmq[i]->totalLLCmisses = 0; - ixgmq[i]->totalNrg = 0; + ixgmq[i]->fireCount = 0; ixgmq[i]->perfCycles.Start(); ixgmq[i]->perfInst.Start(); ixgmq[i]->perfLLC_miss.Start(); + */ + + } else if(s == "stop_perf") { if(i == 0 
|| i == 1) { + ixgmq[i]->powerMeter.Stop(); + ixgmq[i]->totalNrg += ixgmq[i]->powerMeter.Read(); auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - ixgmq[i]->time_us = std::chrono::duration_cast(d).count(); - ixgmq[i]->powerMeter.Start(); + auto endt = std::chrono::duration_cast(d).count(); + ixgmq[i]->totalTime = ((double)(endt - (ixgmq[i]->time_us)) / 1000000.0); + ixgmq[i]->totalPower = ixgmq[i]->totalNrg / ixgmq[i]->totalTime; } - ebb_->StartTimer();*/ - } else if(s == "stop_perf") { - ixgmq[i]->perfCycles.Stop(); + /*ixgmq[i]->perfCycles.Stop(); ixgmq[i]->perfInst.Stop(); ixgmq[i]->perfLLC_miss.Stop(); - if(i == 0 || i == 1) { - ixgmq[i]->powerMeter.Stop(); - } + // accumulate counters ixgmq[i]->totalCycles += static_cast(ixgmq[i]->perfCycles.Read()); ixgmq[i]->totalIns += static_cast(ixgmq[i]->perfInst.Read()); @@ -134,12 +146,18 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ixgmq[i]->perfLLC_miss.Clear(); ixgmq[i]->stat_num_recv = 0; - ebb_->StopTimer(); - + ebb_->StopTimer(); */ } else if(s == "print") { - /*uint64_t cycs, ins, llc, nints; double ttime, tnrg; - ttime = tnrg = 0.0; + ttime = tnrg = 0.0; + ttime = ixgmq[0]->totalTime > ixgmq[1]->totalTime ? ixgmq[0]->totalTime : ixgmq[1]->totalTime; + for(uint32_t i = 0; i < static_cast(Cpu::Count()); i++) { + tnrg += ixgmq[i]->totalNrg; + } + ebbrt::kprintf_force("TIME=%.2fs\n", ttime); + ebbrt::kprintf_force("WATTS=%.2f\n", tnrg/ttime); + + /*uint64_t cycs, ins, llc, nints; cycs = ins = llc = nints = 0; for(uint32_t i = 0; i < static_cast(Cpu::Count()); i++) { @@ -149,8 +167,7 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { tnrg += ixgmq[i]->totalNrg; nints += ixgmq[i]->totalInterrupts; } - ttime = ixgmq[0]->totalTime > ixgmq[1]->totalTime ? 
ixgmq[0]->totalTime : ixgmq[1]->totalTime; - + ixgmq[i]->str_stats.str(""); ixgmq[i]->str_stats.precision(20); ixgmq[i]->str_stats << "INSTRUCTIONS=" << ins @@ -168,11 +185,10 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ebbrt::kprintf_force("CYCLES=%llu\n", cycs); ebbrt::kprintf_force("IPC=%.2f\n", (float)ins/cycs); ebbrt::kprintf_force("LLC_MISSES=%llu\n", llc); - ebbrt::kprintf_force("TIME=%.2f\n", ttime); - ebbrt::kprintf_force("WATTS=%.2f\n", tnrg/ttime); + ebbrt::kprintf_force("AVG_ITR_PER_CORE=%.2f\n", (float)nints/static_cast(Cpu::Count())); ebbrt::kprintf_force("\n");*/ - + ebbrt::kprintf_force("\n%15s %15s %15s\n", "num_desc","tx","rx"); for(int j=0;j<40;j++) { uint32_t sumt = 0; uint32_t sumr = 0; @@ -180,7 +196,7 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { sumt += ixgmq[c]->tx_desc_counts[j]; sumr += ixgmq[c]->rx_desc_counts[j]; } - ebbrt::kprintf_force("%d,%d,%d\n", j, sumt, sumr); + ebbrt::kprintf_force("%15d %15d %15d\n", j, sumt, sumr); } //ebbrt::kprintf_force("nrg=%.2lf J\n", tnrg); //ebbrt::kprintf_force("ttime=%.2f s time1=%.2f s time2=%.2f s\n", ttime, ixgmq[0]->totalTime, ixgmq[1]->totalTime); @@ -227,6 +243,7 @@ void ebbrt::IxgbeDriverRep::ReclaimTx() { // (IxgbeDriver::NTXDESCS - 1): 340 W, 1599820.2, eax=0x60 if(free_desc < (IxgbeDriver::NTXDESCS - 1)) { + //if(free_desc < (IxgbeDriver::NTXDESCS - 1)) { auto head = ixgmq_.tx_head_; auto tail = ixgmq_.tx_tail_; @@ -600,7 +617,7 @@ void ebbrt::IxgbeDriverRep::SendTCPChained(std::unique_ptr buf, uint64_t i = 0; for (auto& buf_it : *buf) { if (buf_it.Length() > IXGBE_MAX_DATA_PER_TXD) { - ebbrt::kprintf("TSE buf_it.Length() = %u > IXGBE_MAX_DATA_PER_TXD\n", buf_it.Length()); + ebbrt::kprintf("TSE buf_it.Length() = %u > IXGBE_MAX_DATA_PER_TXD num_chains=%llu i=%llu\n", buf_it.Length(), num_chains, i); return; } @@ -660,16 +677,16 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { num_chains = buf->CountChainElements(); ReclaimTx(); 
- + +#ifdef RSC_EN // if no IP/TCP checksum - likely UDP packet if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { SendUdp(std::move(buf), len, pinfo); } else { // TCP Packet - // hardware limits sending over 40 descriptors per packet, have to manually coalesce here // hopefully not too often if(num_chains > 38) { - //ixgmq_.tx_desc_counts[39] ++; + ixgmq_.tx_desc_counts[39] ++; //ebbrt::kprintf_force("*** num_chains=%d > 38\n", num_chains); b = MakeUniqueIOBuf(len); auto mdata = b->MutData(); @@ -680,14 +697,35 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { //data = reinterpret_cast(b->MutData()); SendTCPUnchained(std::move(b), len, pinfo); - } else if(buf->IsChained() && num_chains <= 38) { - //ixgmq_.tx_desc_counts[num_chains] ++; + } else if(num_chains > 1 && num_chains <= 38) { + ixgmq_.tx_desc_counts[num_chains] ++; SendTCPChained(std::move(buf), len, num_chains, pinfo); } else { //Not Chained - //ixgmq_.tx_desc_counts[1] ++; + ixgmq_.tx_desc_counts[1] ++; SendTCPUnchained(std::move(buf), len, pinfo); } - } + } +#else + // if no IP/TCP checksum - likely UDP packet + if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { + SendUdp(std::move(buf), len, pinfo); + } else { // TCP Packet + ixgmq_.tx_desc_counts[1] ++; + + if(num_chains > 1) { + b = MakeUniqueIOBuf(len); + auto mdata = b->MutData(); + for (auto& buf_it : *buf) { + memcpy(mdata, buf_it.Data(), buf_it.Length()); + mdata += buf_it.Length(); + } + SendTCPUnchained(std::move(b), len, pinfo); + } else { + SendTCPUnchained(std::move(buf), len, pinfo); + } + } +#endif + //ebbrt::kprintf("\t Send() core=%u head=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_tail_, free_desc); asm volatile("sfence" ::: "memory"); //ebbrt::kprintf("\t Send() core=%u head=%u last_tail=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_last_tail_, ixgmq_.tx_tail_, free_desc); @@ -1805,7 +1843,7 
@@ void ebbrt::IxgbeDriver::Init() { /* setup msix */ // switch to msix mode - //WriteGpie(0x1 << 4); // Multiple_MSIX + WriteGpie(0x1 << 4 | 0x1 << 5 | 0x1 << 31 | 0x1 << 30); // Multiple_MSIX //WriteGpie(0x1 << 5); // OCD //WriteGpie(0x1 << 31); // PBA_support // Enable auto masking of interrupt @@ -1814,10 +1852,10 @@ void ebbrt::IxgbeDriver::Init() { // TODO: Set up management interrupt handler //WriteGpie(0xC0000036); - uint32_t gpie = 0xC0000036 | (0x7 << 11); + //uint32_t gpie = 0xC0000036 | (0x7 << 11); //uint32_t gpie = 0xC0000036 | (IxgbeDriver::RSC_DELAY << 11); - WriteGpie(gpie); - ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d us\n", (((gpie >> 11) & 0x7)+1)*4); + //WriteGpie(gpie); + //ebbrt::kprintf_force("RSC enabled, RSC_DELAY = %d us\n", (((gpie >> 11) & 0x7)+1)*4); //#ifdef RSC_EN // TODO: RSC delay value, just a guess at (1 + 1) * 4us = 8 us @@ -2086,10 +2124,10 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { WriteRdlen_1(i, ixgmq[i]->rx_size_bytes_); // program srrctl register - //WriteSrrctl_1(i, 0x2000403); // 3KB + WriteSrrctl_1(i, 0x2000402); // 3KB //WriteSrrctl_1(i, 0x2000410); // 16KB //WriteSrrctl_1(i, (0x1 << 25) | (0x4 << 8) | (IxgbeDriver::RXBUFSZ / 1024)); // desctype adv 001b, BSIZEHEADER = 0x7 * 64B, BSIZEPACKET= 0x4 * 1 KB - WriteSrrctl_1(i, (0x1 << 25) | (0x4 << 8) | (3072 / 1024)); + //WriteSrrctl_1(i, (0x1 << 25) | (0x4 << 8) | (3072 / 1024)); /*WriteSrrctlZero(i); WriteSrrctl_1(i, RXBUFSZ / 1024); // bsizepacket @@ -2120,7 +2158,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { received MSS *****/ - WriteRscctl(i, 0x1 | (0x10 << 2)); // RSCEN=1, MAXDESC= (0x1) * SRRCTL.BSIZEPACKET < 64KB + WriteRscctl(i, 0x1 | (0x11 << 2)); // RSCEN=1, MAXDESC= (0x1) * SRRCTL.BSIZEPACKET < 64KB WritePsrtype(i, 0x1 << 4); // 4.6.7.2.2 - PSR_type4 in PSRTYPE[n] should be set #endif @@ -2160,9 +2198,9 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { } // must be greater than rsc delay - //WriteEitr(i, 
(IxgbeDriver::ITR_INTERVAL << 3) | IXGBE_EITR_CNT_WDIS); + WriteEitr(i, (IxgbeDriver::ITR_INTERVAL << 3) | IXGBE_EITR_CNT_WDIS); // WriteEitr(i, 0x80 << 3); // 7 * 2us = 14 us - WriteEitr(i, (32 << 3) | IXGBE_EITR_CNT_WDIS); + //WriteEitr(i, (32 << 3) | IXGBE_EITR_CNT_WDIS); // 7.3.1.4 - Note that there are no EIAC(1)...EIAC(2) registers. // The hardware setting for interrupts 16...63 is always auto clear. @@ -2284,7 +2322,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { rdesc_adv_wb_t* rx_desc; uint32_t mcore = static_cast(Cpu::GetMine()); - i = 0; + i = 0; while(i < 64) { rxflag = 0x0; rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); @@ -2339,7 +2377,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { // handle a single receive if(rx_desc->eop) { - //ixgmq_.rx_desc_counts[1] ++; + ixgmq_.rx_desc_counts[1] ++; //if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) plen = rx_desc->pkt_len; @@ -2379,7 +2417,12 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { } else { - //uint32_t rsc_count = 0; +#ifndef RSC_EN + ebbrt::kprintf_force("RSC packet in non-RSC mode\n"); + return; +#endif + + uint32_t rsc_count = 0; //RSC FIRST PACKET plen = rx_desc->pkt_len; @@ -2396,7 +2439,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.cleaned_count ++; i ++; ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - //rsc_count ++; + rsc_count ++; while(true) { rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); @@ -2434,7 +2477,7 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - //rsc_count ++; + rsc_count ++; if(rx_desc->eop) { rxflag = 0; // TCP/UDP checksum @@ -2451,8 +2494,8 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { rxflag |= RXFLAG_IPCS_VALID; } } - - //ixgmq_.rx_desc_counts[rsc_count] ++; + + ixgmq_.rx_desc_counts[rsc_count] ++; //if(b->ComputeChainDataLength() > 256) { // auto p1 = reinterpret_cast(b->MutData()); // for (int i = 0; i < 248; i+=8) { diff --git a/src/native/IxgbeDriver.h 
b/src/native/IxgbeDriver.h index 9ee9f1f5..ae0e1206 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -22,7 +22,7 @@ #include "Rapl.h" // Receive Side Scaling (RSC) enabled -#define RSC_EN +//#define RSC_EN // Direct Cache Access (DCA) enabled //#define DCA_ENABLE // Transmit Header Writeback enabled @@ -113,26 +113,29 @@ class IxgbeDriver : public EthernetDevice { #else static const constexpr uint32_t NTXDESCS = 512; static const constexpr uint32_t NRXDESCS = 512; + //static const constexpr uint32_t NTXDESCS = 4096; + //static const constexpr uint32_t NRXDESCS = 4096; #endif // Linux Defaults - //static const constexpr uint32_t RXBUFSZ = 2048; + static const constexpr uint32_t RXBUFSZ = 2048; //static const constexpr uint32_t RXBUFSZ = 8192; static const constexpr uint32_t BSIZEHEADER = 256; - static const constexpr uint32_t RXBUFSZ = 4092; + //static const constexpr uint32_t RXBUFSZ = 4096; //static const constexpr uint32_t RXBUFSZ = 8192; //static const constexpr uint32_t RXBUFSZ = 16384; // 8 bits (3 - 11) in (ITR_INTERVAL * 2 us) - static const constexpr uint8_t ITR_INTERVAL = 32; + //static const constexpr uint8_t ITR_INTERVAL = 32; + static const constexpr uint8_t ITR_INTERVAL = 8; // 3 bits only (0 - 7) in (RSC_DELAY + 1) * 4 us static const constexpr uint8_t RSC_DELAY = 7; // DMA Tx TCP Max Allow Size Requests — DTXMXSZRQ - static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0x10; - //static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0xFFF; + //static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0x10; + static const constexpr uint16_t MAX_BYTES_NUM_REQ = 0xFFF; // Class with per core queue data structures class e10Kq { From bb616b8d6dd7bbf39577593eb56d4aea6c57f54c Mon Sep 17 00:00:00 2001 From: Han Date: Thu, 30 Jul 2020 17:01:38 -0400 Subject: [PATCH 18/20] Added per interrupt logging mechanism --- src/native/EventManager.cc | 18 +- src/native/Ixgbe.h | 1 + src/native/IxgbeDriver.cc | 969 ++++++++----------------------------- 
src/native/IxgbeDriver.h | 108 +++-- src/native/Rapl.h | 15 +- src/native/Uart8250.cc | 2 +- 6 files changed, 287 insertions(+), 826 deletions(-) diff --git a/src/native/EventManager.cc b/src/native/EventManager.cc index 97de8cb5..705a0cc4 100644 --- a/src/native/EventManager.cc +++ b/src/native/EventManager.cc @@ -164,15 +164,15 @@ void ebbrt::EventManager::Process() { :: "a" ((void*)&flags), "c" (ecx), "d"(edx)); // https://elixir.bootlin.com/linux/v4.15.1/source/arch/x86/include/asm/mwait.h#L100 - ecx = 1; - - //eax = 0x20; - //eax = 0x60; - - // C1E state - //eax = 0x1; - - // C7 state + // https://elixir.bootlin.com/linux/v5.5.1/source/drivers/idle/intel_idle.c + + // sandy bridge + // C1 0x00 + // C1E 0x01 + // C3 0x10 + // C6 0x20 + // C7 0x30 + ecx = 1; /* break on interrupt flag */ eax = 0x30; asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" diff --git a/src/native/Ixgbe.h b/src/native/Ixgbe.h index 6e281acb..8ef9c07a 100644 --- a/src/native/Ixgbe.h +++ b/src/native/Ixgbe.h @@ -39,6 +39,7 @@ // max transmit sizes #define IXGBE_MAX_TXD_PWR 14 #define IXGBE_MAX_DATA_PER_TXD (1u << IXGBE_MAX_TXD_PWR) +#define IXGBE_TSO_LIMIT 262144 enum l4_type { l4_type_udp = 0, l4_type_tcp, l4_type_sctp, l4_type_rsv }; diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index ed38770d..5ae00bea 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -19,22 +19,9 @@ #include #include -void dumpPacketContents(uint8_t* p1, uint64_t len) { - uint64_t i, j; - - ebbrt::kprintf_force("dumpPacketContents() len=%u\n", len); - for (i = 0; i < len; i+=8) { - if (i+8 < len) { - ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - } else { - for(j=i;j bsendbufs[16]; void ebbrt::IxgbeDriver::Create(pci::Device& dev) { auto ixgbe_dev = new IxgbeDriver(dev); @@ -60,6 +47,27 @@ void ebbrt::IxgbeDriver::Create(pci::Device& dev) { // TODO remove? 
ebbrt::clock::SleepMilli(200); ebbrt::kprintf("82599 initialze complete\n"); + + memset(ixgbe_stats, 0, sizeof(ixgbe_stats)); + memset(ixgbe_logs, 0, sizeof(ixgbe_logs)); + + uint32_t ncores = static_cast(ebbrt::Cpu::Count()); + for (uint32_t i = 0; i < ncores; i++) { + ebbrt::Promise p; + auto f = p.GetFuture(); + ebbrt::event_manager->SpawnRemote( + [i, &p] () mutable { + ixgbe_logs[i] = (union IxgbeLogEntry *)malloc(IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry)); + memset(ixgbe_logs[i], 0, sizeof(IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry))); + + bsendbufs[i] = MakeUniqueIOBuf(IXGBE_TSO_LIMIT); + memset(bsendbufs[i]->MutData(), 0, IXGBE_TSO_LIMIT); + ebbrt::kprintf_force("i=%d sizeof=%u ixgbe_addr=%p bsendbufs_addr=%p\n", i, + sizeof(ixgbe_logs[i]), (void*)ixgbe_logs[i], (void*)(bsendbufs[i]->Data())); + p.SetValue(); + }, i); + f.Block(); + } } const ebbrt::EthernetAddress& ebbrt::IxgbeDriver::GetMacAddress() { @@ -71,155 +79,6 @@ std::string ebbrt::IxgbeDriver::ReadNic() { return ixgmq[i]->str_stats.str(); } -void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { - uint32_t i = static_cast(Cpu::GetMine()); - if(s == "rx_usecs") { - ixgmq[i]->itr_val = v; - ebbrt::kprintf_force("rx-usecs = %u\n", ixgmq[i]->itr_val*2); - WriteEitr(i, (ixgmq[i]->itr_val << 3) | IXGBE_EITR_CNT_WDIS); - } else if(s == "rapl") { - if(i == 0 || i == 1) { - ixgmq[i]->powerMeter.SetLimit(v); - } - } else if(s == "start_perf") { - ebbrt::kprintf_force("start_perf %d\n", i); - for(int j=0;j < 100;j++) { - ixgmq[i]->rx_desc_counts[j] = 0; - ixgmq[i]->tx_desc_counts[j] = 0; - } - - ixgmq[i]->time_us = 0; - ixgmq[i]->totalNrg = 0; - if(i == 0 || i == 1) { - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - ixgmq[i]->time_us = std::chrono::duration_cast(d).count(); - ixgmq[i]->powerMeter.Start(); - } - - //ebb_->StartTimer(); - - //ebbrt::kprintf_force("%d: %d %d\n", i, ixgmq[i]->tx_desc_counts.size(), ixgmq[i]->rx_desc_counts.size()); - /*ixgmq[i]->stat_num_recv = 0; - - 
ixgmq[i]->totalCycles = 0; - ixgmq[i]->totalIns = 0; - ixgmq[i]->totalLLCmisses = 0; - - ixgmq[i]->fireCount = 0; - - ixgmq[i]->perfCycles.Start(); - ixgmq[i]->perfInst.Start(); - ixgmq[i]->perfLLC_miss.Start(); - - */ - - } else if(s == "stop_perf") { - if(i == 0 || i == 1) { - ixgmq[i]->powerMeter.Stop(); - ixgmq[i]->totalNrg += ixgmq[i]->powerMeter.Read(); - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - auto endt = std::chrono::duration_cast(d).count(); - ixgmq[i]->totalTime = ((double)(endt - (ixgmq[i]->time_us)) / 1000000.0); - ixgmq[i]->totalPower = ixgmq[i]->totalNrg / ixgmq[i]->totalTime; - } - - /*ixgmq[i]->perfCycles.Stop(); - ixgmq[i]->perfInst.Stop(); - ixgmq[i]->perfLLC_miss.Stop(); - - // accumulate counters - ixgmq[i]->totalCycles += static_cast(ixgmq[i]->perfCycles.Read()); - ixgmq[i]->totalIns += static_cast(ixgmq[i]->perfInst.Read()); - ixgmq[i]->totalLLCmisses += static_cast(ixgmq[i]->perfLLC_miss.Read()); - ixgmq[i]->totalInterrupts = ixgmq[i]->stat_num_recv; - - if(i == 0 || i == 1) { - ixgmq[i]->totalNrg += ixgmq[i]->powerMeter.Read(); - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - auto endt = std::chrono::duration_cast(d).count(); - ixgmq[i]->totalTime = ((double)(endt - (ixgmq[i]->time_us)) / 1000000.0); - //ixgmq[i]->totalPower = ixgmq[i]->totalNrg / ixgmq[i]->totalTime; - //ebbrt::kprintf_force("Core %u: cycles=%llu ins=%llu llc=%llu energy=%.2lfJ totalTime=%.2f secs Power (Watts): %.2lf\n", i, ixgmq[i]->totalCycles, ixgmq[i]->totalIns, ixgmq[i]->totalLLCmisses, ixgmq[i]->totalNrg, totalTime, ); - } - ixgmq[i]->perfCycles.Clear(); - ixgmq[i]->perfInst.Clear(); - ixgmq[i]->perfLLC_miss.Clear(); - ixgmq[i]->stat_num_recv = 0; - - ebb_->StopTimer(); */ - } else if(s == "print") { - double ttime, tnrg; - ttime = tnrg = 0.0; - ttime = ixgmq[0]->totalTime > ixgmq[1]->totalTime ? 
ixgmq[0]->totalTime : ixgmq[1]->totalTime; - for(uint32_t i = 0; i < static_cast(Cpu::Count()); i++) { - tnrg += ixgmq[i]->totalNrg; - } - ebbrt::kprintf_force("TIME=%.2fs\n", ttime); - ebbrt::kprintf_force("WATTS=%.2f\n", tnrg/ttime); - - /*uint64_t cycs, ins, llc, nints; - cycs = ins = llc = nints = 0; - - for(uint32_t i = 0; i < static_cast(Cpu::Count()); i++) { - cycs += ixgmq[i]->totalCycles; - ins += ixgmq[i]->totalIns; - llc += ixgmq[i]->totalLLCmisses; - tnrg += ixgmq[i]->totalNrg; - nints += ixgmq[i]->totalInterrupts; - } - - ixgmq[i]->str_stats.str(""); - ixgmq[i]->str_stats.precision(20); - ixgmq[i]->str_stats << "INSTRUCTIONS=" << ins - << " CYCLES=" << cycs - << " IPC=" << (float)ins/cycs - << " LLC_MISSES=" << llc - << " TIME=" << ttime - << " WATTS=" << tnrg/ttime - << " AVG_ITR_PER_CORE=" << (float)nints/static_cast(Cpu::Count()) - << " ITR=" << ixgmq[i]->itr_val * 2 - << " RAPL=" << ixgmq[i]->rapl_val; - - - ebbrt::kprintf_force("INSTRUCTIONS=%llu\n", ins); - ebbrt::kprintf_force("CYCLES=%llu\n", cycs); - ebbrt::kprintf_force("IPC=%.2f\n", (float)ins/cycs); - ebbrt::kprintf_force("LLC_MISSES=%llu\n", llc); - - ebbrt::kprintf_force("AVG_ITR_PER_CORE=%.2f\n", (float)nints/static_cast(Cpu::Count())); - ebbrt::kprintf_force("\n");*/ - ebbrt::kprintf_force("\n%15s %15s %15s\n", "num_desc","tx","rx"); - for(int j=0;j<40;j++) { - uint32_t sumt = 0; - uint32_t sumr = 0; - for(uint32_t c = 0; c < static_cast(Cpu::Count()); c++) { - sumt += ixgmq[c]->tx_desc_counts[j]; - sumr += ixgmq[c]->rx_desc_counts[j]; - } - ebbrt::kprintf_force("%15d %15d %15d\n", j, sumt, sumr); - } - //ebbrt::kprintf_force("nrg=%.2lf J\n", tnrg); - //ebbrt::kprintf_force("ttime=%.2f s time1=%.2f s time2=%.2f s\n", ttime, ixgmq[0]->totalTime, ixgmq[1]->totalTime); - - } else if(s == "start_idle") { - ixgmq[i]->time_send = 0; - ixgmq[i]->time_idle_min = 999999; - ixgmq[i]->time_idle_max = 0; - ixgmq[i]->total_idle_time = 0; - ixgmq[i]->stat_num_recv = 0; - 
ixgmq[i]->idle_times_.clear(); - - } else if(s == "stop_idle") { - ebbrt::kprintf_force("Core %u: idle_min=%llu idle_max=%llu stat_num_recv=%llu avg_idle=%.2lf\n", i, ixgmq[i]->time_idle_min, ixgmq[i]->time_idle_max, ixgmq[i]->stat_num_recv, (double)ixgmq[i]->total_idle_time/ixgmq[i]->stat_num_recv); - /*for(const auto& n : ixgmq[i]->idle_times_) { - ebbrt::kprintf_force("%u: %u\n", n.first, n.second); - }*/ - } else { - ebbrt::kprintf_force("%s Unknown command: %s\n", __PRETTY_FUNCTION__, s); - - } -} - void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { ebb_->Send(std::move(buf), std::move(pinfo)); } @@ -227,9 +86,9 @@ void ebbrt::IxgbeDriver::Send(std::unique_ptr buf, PacketInfo pinfo) { // After packet transmission, need to mark bit in // tx queue so that it can be used again // TX_HEAD_WB does it automatically -void ebbrt::IxgbeDriverRep::ReclaimTx() { - // with TX head writeback, shouldn't need to poll anymore (right?) +void ebbrt::IxgbeDriverRep::ReclaimTx() { #ifndef TX_HEAD_WB + // with TX head writeback, shouldn't need to poll anymore (right?) 
tdesc_advance_tx_wbf_t* awbfx; uint32_t free_desc; @@ -243,7 +102,6 @@ void ebbrt::IxgbeDriverRep::ReclaimTx() { // (IxgbeDriver::NTXDESCS - 1): 340 W, 1599820.2, eax=0x60 if(free_desc < (IxgbeDriver::NTXDESCS - 1)) { - //if(free_desc < (IxgbeDriver::NTXDESCS - 1)) { auto head = ixgmq_.tx_head_; auto tail = ixgmq_.tx_tail_; @@ -313,14 +171,16 @@ void ebbrt::IxgbeDriverRep::SendUdp(std::unique_ptr buf, uint64_t len, Pa } -void ebbrt::IxgbeDriverRep::SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo) { +//void ebbrt::IxgbeDriverRep::SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo) { +void ebbrt::IxgbeDriverRep::SendTCPUnchained(uint64_t bdata, uint64_t len, PacketInfo pinfo) { uint64_t data, tsodata, tsolen; std::unique_ptr b; tdesc_advance_tx_rf_t* arfx; tdesc_advance_ctxt_wb_t* actx; uint32_t end; - data = reinterpret_cast(buf->Data()); + //data = reinterpret_cast(buf->Data()); + data = bdata; //ebbrt::kprintf_force("SendTCPUnchained len=%llu\n", len); if(len > IXGBE_MAX_DATA_PER_TXD) { @@ -666,35 +526,37 @@ void ebbrt::IxgbeDriverRep::SendTCPChained(std::unique_ptr buf, uint64_t void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { uint64_t len, num_chains; uint32_t mcore = static_cast(Cpu::GetMine()); - std::unique_ptr b; + //std::unique_ptr b; // On TSO, the maximum PAYLEN can be up to 2^18 - 1 len = buf->ComputeChainDataLength(); - if (len > 262144) { + ixgmq_.stat_num_tx_bytes += len; + if (len > IXGBE_TSO_LIMIT) { ebbrt::kprintf_force("\t kabort Send() len=%lld greater than TSO limit of 262144 bytes\n", len); return; } num_chains = buf->CountChainElements(); - + +#ifndef TX_HEAD_WB ReclaimTx(); +#endif -#ifdef RSC_EN +/*#ifdef RSC_EN // if no IP/TCP checksum - likely UDP packet if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { SendUdp(std::move(buf), len, pinfo); } else { // TCP Packet // hardware limits sending over 40 descriptors per packet, have to 
manually coalesce here // hopefully not too often + //ebbrt::kprintf_force("Send() len=%llu num_chains=%llu\n", len, num_chains); if(num_chains > 38) { ixgmq_.tx_desc_counts[39] ++; - //ebbrt::kprintf_force("*** num_chains=%d > 38\n", num_chains); b = MakeUniqueIOBuf(len); auto mdata = b->MutData(); for (auto& buf_it : *buf) { memcpy(mdata, buf_it.Data(), buf_it.Length()); mdata += buf_it.Length(); } - //data = reinterpret_cast(b->MutData()); SendTCPUnchained(std::move(b), len, pinfo); } else if(num_chains > 1 && num_chains <= 38) { @@ -705,41 +567,33 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { SendTCPUnchained(std::move(buf), len, pinfo); } } -#else +#else*/ // if no IP/TCP checksum - likely UDP packet if (!(pinfo.flags & PacketInfo::kNeedsCsum) && !(pinfo.flags & PacketInfo::kNeedsIpCsum)) { - SendUdp(std::move(buf), len, pinfo); + SendUdp(std::move(buf), len, pinfo); } else { // TCP Packet ixgmq_.tx_desc_counts[1] ++; if(num_chains > 1) { - b = MakeUniqueIOBuf(len); - auto mdata = b->MutData(); + //b = MakeUniqueIOBuf(len); + auto mdata = bsendbufs[mcore]->MutData(); for (auto& buf_it : *buf) { memcpy(mdata, buf_it.Data(), buf_it.Length()); mdata += buf_it.Length(); } - SendTCPUnchained(std::move(b), len, pinfo); + //SendTCPUnchained(std::move(b), len, pinfo); + SendTCPUnchained(reinterpret_cast(bsendbufs[mcore]->Data()), len, pinfo); + ixgmq_.stat_num_tx_desc += num_chains; } else { - SendTCPUnchained(std::move(buf), len, pinfo); + SendTCPUnchained(reinterpret_cast(buf->Data()), len, pinfo); + ixgmq_.stat_num_tx_desc += 1; + //SendTCPUnchained(std::move(buf), len, pinfo); } } -#endif +//#endif - //ebbrt::kprintf("\t Send() core=%u head=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_tail_, free_desc); - asm volatile("sfence" ::: "memory"); - //ebbrt::kprintf("\t Send() core=%u head=%u last_tail=%u tail=%u free_desc=%u\n", mcore, ixgmq_.tx_head_, ixgmq_.tx_last_tail_, ixgmq_.tx_tail_, free_desc); - - 
WriteTdt_1(mcore, ixgmq_.tx_tail_); - - //while(arfx->dd == 0) { - // makes sure all reads are finished before checking again - //asm volatile("lfence":::"memory"); - //} - - //auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - //ixgmq_.time_send = std::chrono::duration_cast(d).count(); - + asm volatile("sfence" ::: "memory"); + WriteTdt_1(mcore, ixgmq_.tx_tail_); } void ebbrt::IxgbeDriver::WriteRxctrl(uint32_t m) { @@ -1958,16 +1812,6 @@ void ebbrt::IxgbeDriver::Init() { // Fill in RSS redirection table (128 entries), sets which core the lowest 7 bits of hashed output goes to // hacky atm for (auto i = 0; i < 32; i += 4) { - /*if(ncore > 0) { - WriteReta(i, 0x0000000); - WriteReta(i+1, 0x0000000); - WriteReta(i+2, 0x0000000); - WriteReta(i+3, 0x0000000); - WriteReta(i, 0x03020100); - WriteReta(i+1, 0x07060504); - WriteReta(i+2, 0x0B0A0908); - WriteReta(i+3, 0x0F0E0D0C); - }*/ // all route to core 0 if(ncore == 1) { WriteReta(i, 0x0000000); @@ -1995,10 +1839,15 @@ void ebbrt::IxgbeDriver::Init() { WriteReta(i+2, 0x3020100); WriteReta(i+3, 0x7060504); } else if(ncore == 16){ - WriteReta(i, 0x03020100); + /*WriteReta(i, 0x03020100); WriteReta(i+1, 0x07060504); WriteReta(i+2, 0x0B0A0908); - WriteReta(i+3, 0x0F0E0D0C); + WriteReta(i+3, 0x0F0E0D0C);*/ + ebbrt::kprintf_force("+++ all interrupts firing on Core 1\n"); + WriteReta(i, 0x1010101); + WriteReta(i+1, 0x1010101); + WriteReta(i+2, 0x1010101); + WriteReta(i+3, 0x1010101); } else { ebbrt::kabort("%s: Can only redirect interrupts to 16 cores\n", __FUNCTION__); } @@ -2321,16 +2170,71 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { uint64_t rxflag; rdesc_adv_wb_t* rx_desc; uint32_t mcore = static_cast(Cpu::GetMine()); + uint32_t icnt; + uint64_t now = 0, last = 0; + uint64_t cjoules, cins, ccyc, crefcyc, cllc; + uint64_t c3, c6, c7; + if(ixgmq_.collect_stats) { + icnt = ixgbe_stats[mcore].itr_cnt; + ixgbe_stats[mcore].itr_cnt2 ++; + + if (icnt < IXGBE_LOG_SIZE) { + //get current tsc and store it + now = 
ebbrt::trace::rdtsc(); + kassert(now != 0); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.tsc), now); + + // get last tsc + last = ixgbe_stats[mcore].itr_joules_last_tsc; + + // ~ 1 ms has passed + if ((now - last) > TSC_KHZ) { + cjoules = ixgmq_.powerMeter.ReadMsr(); + cins = ixgmq_.perfInst.Read(); + ccyc = ixgmq_.perfCycles.Read(); + crefcyc = ixgmq_.perfRefCycles.Read(); + cllc = ixgmq_.perfLLC_miss.Read(); + + c3 = ebbrt::msr::Read(0x3FC); + c6 = ebbrt::msr::Read(0x3FD); + c7 = ebbrt::msr::Read(0x3FE); + + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.joules), cjoules); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ninstructions), cins); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ncycles), ccyc); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nref_cycles), crefcyc); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nllc_miss), cllc); + + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c3), c3); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c6), c6); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c7), c7); + + ixgbe_stats[mcore].itr_joules_last_tsc = now; + } + + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.rx_desc), ixgmq_.stat_num_rx_desc); + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.rx_bytes), ixgmq_.stat_num_rx_bytes); + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_desc), ixgmq_.stat_num_tx_desc); + __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_bytes), ixgmq_.stat_num_tx_bytes); + + ixgmq_.stat_num_rx_bytes = 0; + ixgmq_.stat_num_rx_desc = 0; + ixgmq_.stat_num_tx_bytes = 0; + ixgmq_.stat_num_tx_desc = 0; + ixgbe_stats[mcore].itr_cnt++; + } + } + i = 0; while(i < 64) { rxflag = 0x0; rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); // Linux's ixgbe: - // This memory barrier is needed to keep us from reading - // any other fields out of the rx_desc until we know the - // descriptor has been 
written back + // This memory barrier is needed to keep us from reading + // any other fields out of the rx_desc until we know the + // descriptor has been written back // asm volatile("" ::: "memory"); @@ -2341,9 +2245,6 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { // return buffers to hardware if(ixgmq_.cleaned_count > IXGBE_RX_BUFFER_WRITE) { - //ebbrt::kprintf_force("START c=%u cleaned_count=%u head=%u tail=%u ", mcore, ixgmq_.cleaned_count, - // ixgmq_.rx_head_, ixgmq_.rx_tail_); - while(ixgmq_.cleaned_count) { // reset buffer ixgmq_.rx_ring_[ixgmq_.rx_tail_].raw[0] = 0x0; @@ -2369,9 +2270,6 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { // wmb(); WriteRdt_1(mcore, ixgmq_.rx_tail_); - - //ebbrt::kprintf_force("END c=%u cleaned_count=%u head=%u tail=%u\n", mcore, ixgmq_.cleaned_count, -// ixgmq_.rx_head_, ixgmq_.rx_tail_); } // handle a single receive @@ -2379,7 +2277,6 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { { ixgmq_.rx_desc_counts[1] ++; - //if (cleaned_count >= IXGBE_RX_BUFFER_WRITE) plen = rx_desc->pkt_len; if(!plen) return; // Linux's ixgbe driver checks this case @@ -2402,16 +2299,13 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); auto b = std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_]); ixgmq_.cleaned_count ++; + ixgmq_.stat_num_rx_desc ++; i ++; - - //ebbrt::kprintf_force("ReceivePoll() core=%u, head=%u tail=%u plen=%u\n", -// mcore, ixgmq_.rx_head_, ixgmq_.rx_tail_, plen); - // update next rx descriptor to process - //ntc = ixgmq_.rx_head_ + 1; - //ntc = (ntc < ixgmq_.rx_size_) ? ntc : 0; - //ixgmq_.rx_head_ = ntc; + ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - + ixgmq_.total_rx_bytes += b->ComputeChainDataLength(); + ixgmq_.stat_num_rx_bytes += b->ComputeChainDataLength(); + // TODO add _mm_prefetch from Linux?? 
root_.itf_.Receive(std::move(b), rxflag); } @@ -2444,21 +2338,16 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { while(true) { rx_desc = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); retry: - // Linux's ixgbe: - // This memory barrier is needed to keep us from reading - // any other fields out of the rx_desc until we know the - // descriptor has been written back - // + // Linux's ixgbe: + // This memory barrier is needed to keep us from reading + // any other fields out of the rx_desc until we know the + // descriptor has been written back asm volatile("" ::: "memory"); // if no rx packets ready if (!(rx_desc->dd)) { goto retry; - ebbrt::kprintf_force("**** RSC Abort: core %u rx_desc->dd == 0\n", mcore); - //return; - //mb(); - //rmb(); - //wmb(); + ebbrt::kprintf_force("**** RSC Abort: core %u rx_desc->dd == 0\n", mcore); } plen = rx_desc->pkt_len; @@ -2471,9 +2360,8 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(plen); b->PrependChain(std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_])); ixgmq_.cleaned_count ++; + ixgmq_.stat_num_rx_desc ++; i ++; - - //ebbrt::kprintf("\nRSC desc=%d next_desc=%d len=%d eop=%d\n", ixgmq_.rx_head_, rx_desc->next_descriptor_ptr, plen, rx_desc->eop); ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; @@ -2496,85 +2384,24 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { } ixgmq_.rx_desc_counts[rsc_count] ++; - //if(b->ComputeChainDataLength() > 256) { - // auto p1 = reinterpret_cast(b->MutData()); - // for (int i = 0; i < 248; i+=8) { - // ebbrt::kprintf_force("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - // } - //} - //ebbrt::kprintf("RSC len=%u rsc_count=%d\n", b->ComputeChainDataLength(), rsc_count); break; } } - + ixgmq_.total_rx_bytes += b->ComputeChainDataLength(); + ixgmq_.stat_num_rx_bytes += b->ComputeChainDataLength(); root_.itf_.Receive(std::move(b), rxflag); } } } -/*void 
ebbrt::IxgbeDriverRep::ReceivePoll() { - uint32_t len; - uint64_t bAddr; - uint64_t rxflag; - bool process_rsc; - uint32_t count; - uint32_t rnt; - uint32_t rxhead; - process_rsc = false; - rxflag = 0; - count = 0; - rnt = 0; - uint32_t mcore = static_cast(Cpu::GetMine()); -#ifdef STATS_EN - ixgmq_.stat_num_recv ++; -#endif - - // while there are still packets received - while (GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 1) { - // hit last rsc context, start to process all buffers - if (process_rsc) { - } else { - count ++; - - //ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d\n", mcore, len, rxhead); - -#ifdef STATS_EN - ixgmq_.stat_num_rx_bytes += len; -#endif - - ixgmq_.circ_buffer_[rxhead]->SetLength(len); - auto b = std::move(ixgmq_.circ_buffer_[rxhead]); - - // bump tail ptr - ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; - root_.itf_.Receive(std::move(b), rxflag); - - // reset buffer - ixgmq_.rx_ring_[rxhead].raw[0] = 0; - ixgmq_.rx_ring_[rxhead].raw[1] = 0; - // allocate new rx buffer - ixgmq_.circ_buffer_[rxhead] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); - auto rxphys = - reinterpret_cast((ixgmq_.circ_buffer_[rxhead])->MutData()); - // update buffer with new adder - ixgmq_.rx_ring_[rxhead].buffer_address = rxphys; - - wmb(); - WriteRdt_1(mcore, ixgmq_.rx_tail_); - } - } - }*/ - ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) - : root_(root), ixgq_(root_.GetQueue()), - ixgmq_(root.GetMultiQueue(Cpu::GetMine())), + : root_(root), ixgmq_(root.GetMultiQueue(Cpu::GetMine())), receive_callback_([this]() { ReceivePoll(); }) { //this->ReceivePoll(); ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); - ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); - + ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); ixgmq_.powerMeter = 
ebbrt::rapl::RaplCounter(); } @@ -2662,471 +2489,87 @@ uint32_t ebbrt::IxgbeDriverRep::ReadTdt_1(uint32_t n) { return root_.bar0_.Read32(0x06018 + 0x40 * n) & 0xFFFF; } - /* - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_, rsccnt, tmp->next_descriptor_ptr, tmp->pkt_len, tmp->eop); - auto p1 = reinterpret_cast((ixgmq_.circ_buffer_[ixgmq_.rx_head_])->MutData()); - int i=0; - ebbrt::kprintf_force("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - - rdesc_adv_wb_t* tmp2; - tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+1])); - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+1, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); - - p1 = reinterpret_cast((ixgmq_.circ_buffer_[ixgmq_.rx_head_+1])->MutData()); - i = 0; - ebbrt::kprintf_force("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - return 1; - */ - /*else if (rsccnt > 0 && tmp->eop) { - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_, rsccnt, tmp->next_descriptor_ptr, tmp->pkt_len, tmp->eop); - rdesc_adv_wb_t* tmp2; - tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+1])); - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+1, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); - tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+2])); - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+2, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); - tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+3])); - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+3, tmp2->rsccnt, tmp2->next_descriptor_ptr, 
tmp2->pkt_len, tmp2->eop); - - } else { - ebbrt::kabort("%s rsccnt > 0 && !(tmp->eop) \n", __FUNCTION__); - }*/ - - /* - // not sure what case this is, no context started, eop is set but rsccnt > 0 - else if (rsccnt > 0 && tmp->eop && !(ixgmq_.rsc_used)) { - kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, - "RSC: NEXTP > RX_SIZE\n"); - - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_, rsccnt, tmp->next_descriptor_ptr, tmp->pkt_len, tmp->eop); - rdesc_adv_wb_t* tmp2; - tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+1])); - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+1, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); - tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+2])); - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+2, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); - tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_+3])); - ebbrt::kprintf_force("*** GetRxBuf() rx_head_=%d rsccnt=%d nextp=%d len=%d eop=%d\n", ixgmq_.rx_head_+3, tmp2->rsccnt, tmp2->next_descriptor_ptr, tmp2->pkt_len, tmp2->eop); - - *len = tmp->pkt_len; - - // set rx flags - // TCP/UDP checksum - if (tmp->l4i) { - *rxflag |= RXFLAG_L4CS; - if (!(tmp->l4e)) { - *rxflag |= RXFLAG_L4CS_VALID; - } - } - - // Ipv4 checksum - if (tmp->ipcs) { - *rxflag |= RXFLAG_IPCS; - if (!(tmp->ipe)) { - *rxflag |= RXFLAG_IPCS_VALID; +void dumpPacketContents(uint8_t* p1, uint64_t len) { + uint64_t i, j; + + ebbrt::kprintf_force("dumpPacketContents() len=%u\n", len); + for (i = 0; i < len; i+=8) { + if (i+8 < len) { + ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); + } else { + for(j=i;j 0 && !(tmp->eop) && !(ixgmq_.rsc_used)) { - kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, - "RSC: NEXTP > 
RX_SIZE\n"); - - ebbrt::kprintf_force("GetRxBuf NEW RSC CONTEXT rsccnt=%d len=%d\n", rsccnt, tmp->pkt_len); - - ixgmq_.rsc_used = true; - ixgmq_.rsc_chain_.clear(); - ixgmq_.rsc_chain_.emplace_back( - std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); - // bump head ptr - ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - - return 1; - } - // APPEND TO EXISTING RSC CONTEXT - else if (rsccnt > 0 && !(tmp->eop) && ixgmq_.rsc_used) { - kbugon(tmp->next_descriptor_ptr > ixgmq_.rx_size_, - "RSC: NEXTP > RX_SIZE\n"); - - ebbrt::kprintf_force("GetRxBuf Append RSC CONTEXT rsccnt=%d len=%d\n", rsccnt, tmp->pkt_len); - - ixgmq_.rsc_chain_.emplace_back( - std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); - - // bump head ptr - ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - - return 1; } - // LAST RSC CONTEXT - else if (rsccnt > 0 && tmp->eop && ixgmq_.rsc_used) { - ixgmq_.rsc_used = false; - - ebbrt::kprintf_force("GetRxBuf Last RSC CONTEXT rsccnt=%d len=%d\n", rsccnt, tmp->pkt_len); - - // - // TCP/UDP checksum - if (tmp->l4i) { - *rxflag |= RXFLAG_L4CS; - if (!(tmp->l4e)) { - *rxflag |= RXFLAG_L4CS_VALID; - } - } +} - // Ipv4 checksum - if (tmp->ipcs) { - *rxflag |= RXFLAG_IPCS; - if (!(tmp->ipe)) { - *rxflag |= RXFLAG_IPCS_VALID; - } +void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { + uint32_t i = static_cast(Cpu::GetMine()); + if(s == "rx_usecs") { + ixgmq[i]->itr_val = v; + WriteEitr(i, (ixgmq[i]->itr_val << 3) | IXGBE_EITR_CNT_WDIS); + ebbrt::kprintf_force("%u: rx-usecs = %u\n", i, ixgmq[i]->itr_val*2); + } else if(s == "rapl") { + if(i == 0 || i == 1) { + ixgmq[i]->powerMeter.SetLimit(v); + ebbrt::kprintf_force("%u: rapl = %u\n", i, v); } - - ixgmq_.rsc_chain_.emplace_back( - std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); - - // bump head ptr - ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - - *process_rsc = true; - - return 1; - } else { - // shouldn't hit here - ebbrt::kabort("%s Not 
sure what state\n", __FUNCTION__); - } */ - - //ebbrt::kprintf("\t Core: %d ReceivePoll() len=%d rxhead=%d num_chains=%d *** \n\n", mcore, len, rxhead, b->CountChainElements()); - - /*if (len > 60) { - ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d START\n", mcore, len, rxhead); - auto p1 = reinterpret_cast(b->MutData()); - int i=0; - ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - ebbrt::kprintf("Core: %d ReceivePoll() len=%d rxhead=%d END\n", mcore, len, rxhead); - for (int i = 0; i < (int)len; i+=8) { - if (i+8 < (int)len) { - ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - } else { - for(int j = i; j < (int)len; j++) { - ebbrt::kprintf("%02X\n", p1[j]); - } - } - } - }*/ - /*if (len > 60) { - ebbrt::kprintf("\t ReceivePoll on core: %d len=%u\n", mcore, len); - }*/ - //ebbrt::kprintf("\t ReceivePoll() on core: %d len=%d\n", mcore, len); - - - /*// done with buffer addr above, now to reuse it - auto tail = ixgmq_.rx_tail_; - - // bump tail ptr - ixgmq_.rx_tail_ = (tail + 1) % ixgmq_.rx_size_; - - count++; - - if (count > 0) { - auto tail = ixgmq_.rx_tail_; - - // TODO hack - need to set actual length of data otherwise it'll send - // leftover 0's - ixgmq_.circ_buffer_[tail]->SetLength(len); - - // TODO hack - need to reallocate IOBuf after its been moved to Receive - auto b = std::move(ixgmq_.circ_buffer_[tail]); - - ixgmq_.circ_buffer_[tail] = - std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); - auto rxphys = - reinterpret_cast((ixgmq_.circ_buffer_[tail])->MutData()); - - ixgmq_.rx_ring_[tail].buffer_address = rxphys; - - // dump eth packet info - //if(len > 1500 && len < 1600) { - ebbrt::kprintf("\t ReceivePoll() on core: %d len=%d\n", mcore, len); - - auto p1 = reinterpret_cast(b->MutData()); - for (int i = 0; i < (int)len; i+=8) { - if (i+8 < (int)len) { - 
ebbrt::kprintf("%02X%02X%02X%02X%02X%02X%02X%02X\n", p1[i], p1[i+1], p1[i+2], p1[i+3], p1[i+4], p1[i+5], p1[i+6], p1[i+7]); - } - else{ - ebbrt::kprintf("%02X\n", p1[i]); - } - } - //} - - root_.itf_.Receive(std::move(b), rxflag); - }*/ - - /*if(ixgmq_.time_send > 0) { - auto d = ebbrt::clock::Wall::Now().time_since_epoch(); - uint64_t endt = std::chrono::duration_cast(d).count(); - uint64_t idlet = endt - ixgmq_.time_send; - uint64_t idlet_mod = (idlet / 50) * 50; - - auto got = ixgmq_.idle_times_.find(idlet_mod); - // not found - if(got == ixgmq_.idle_times_.end()) - ixgmq_.idle_times_[idlet_mod] = 1; - else - ixgmq_.idle_times_[idlet_mod] ++; - - ixgmq_.time_idle_min = idlet < ixgmq_.time_idle_min ? idlet : ixgmq_.time_idle_min; - ixgmq_.time_idle_max = idlet > ixgmq_.time_idle_max ? idlet : ixgmq_.time_idle_max; - ixgmq_.total_idle_time += idlet; - }*/ - -// keep check for new packets to receive -// may wait for RSC to be done -// uint32_t ebbrt::IxgbeDriverRep::GetRxBuf(uint32_t* len, uint64_t* bAddr, -// uint64_t* rxflag, bool* process_rsc, -// uint32_t* rnt, uint32_t* rxhead) { -// rdesc_adv_wb_t* tmp; -// tmp = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); -// uint32_t i; -// //uint32_t mcore = static_cast(Cpu::GetMine()); - -// // if no rx packets ready -// if (!(tmp->dd)) { -// return 0; -// } - -// //auto rsccnt = tmp->rsccnt; - -// // makes sure all reads are finished before -// asm volatile("lfence":::"memory"); - -// //ebbrt::kprintf("rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", *rxhead, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); - -// // not RSC, handled normally -// // if (rsccnt == 0 && tmp->eop && tmp->dd) { -// if (tmp->eop && tmp->dd) { -// *len = tmp->pkt_len; - -// /* set rx flags */ -// // TCP/UDP checksum -// if (tmp->l4i) { -// *rxflag |= RXFLAG_L4CS; -// if (!(tmp->l4e)) { -// *rxflag |= RXFLAG_L4CS_VALID; -// } -// } - -// // Ipv4 checksum -// if (tmp->ipcs) { -// *rxflag |= RXFLAG_IPCS; -// if 
(!(tmp->ipe)) { -// *rxflag |= RXFLAG_IPCS_VALID; -// } -// } - -// *rxhead = ixgmq_.rx_head_; -// //ebbrt::kprintf("** GetRxBuf START **\n \t rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", ixgmq_.rx_head_, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, tmp->next_descriptor_ptr); -// //ebbrt::kprintf("\t rx_head=%d rsccnt=%d len=%d rss_type=0x%X rss_hash=0x%X\n", *rxhead, rsccnt, tmp->pkt_len, tmp->rss_type, tmp->rss_hash); - -// // bump head ptr -// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; - -// return 1; -// } else if(!(tmp->eop) && tmp->dd) { -// uint32_t desc_count = 0; -// uint32_t start_header = ixgmq_.rx_head_; - -// /*** RSC FIRST PACKET ***/ -// ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(tmp->pkt_len); -// auto b = std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_]); - -// // bump head ptr -// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; -// // bump tail ptr -// ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; -// desc_count ++; - -// rdesc_adv_wb_t* tmp_next; -// uint32_t next_desc = tmp->next_descriptor_ptr; -// //ebbrt::kprintf("\nRSC start_desc=%d next_desc=%d len=%d\n", start_header, next_desc, tmp->pkt_len); - -// // hopefully won't happen @@ -// if(next_desc != ixgmq_.rx_head_) { -// ebbrt::kabort("1) next_desc=%d != ixgmq_.rx_head_=%d\n", next_desc, ixgmq_.rx_head_); -// } -// tmp_next = reinterpret_cast(&(ixgmq_.rx_ring_[ixgmq_.rx_head_])); - -// /*** RSC MIDDILE CHAINS ***/ -// while(!(tmp_next->eop)) { -// desc_count ++; -// ixgmq_.circ_buffer_[next_desc]->SetLength(tmp_next->pkt_len); -// b->PrependChain(std::move(ixgmq_.circ_buffer_[next_desc])); - -// // bump head ptr -// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; -// // bump tail ptr -// ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; - -// //ebbrt::kprintf("curr_desc=%d, next_desc=%d len=%d\n", next_desc, tmp_next->next_descriptor_ptr, tmp_next->pkt_len); - -// next_desc = tmp_next->next_descriptor_ptr; -// if(next_desc 
!= ixgmq_.rx_head_) { -// ebbrt::kprintf("2) next_desc=%d != ixgmq_.rx_head_=%d\n", next_desc, ixgmq_.rx_head_); -// } -// tmp_next = reinterpret_cast(&(ixgmq_.rx_ring_[next_desc])); -// } - -// /*** RSC LAST PACKET ***/ -// desc_count ++; -// //ebbrt::kprintf("\t GetRxBuf() rx_head_=%d rsccnt=%d len=%d dd=%u eop=%d\n", rh, tmp2->rsccnt, tmp2->pkt_len, tmp2->dd, tmp2->eop); -// if (tmp_next->l4i) { -// *rxflag |= RXFLAG_L4CS; -// if (!(tmp_next->l4e)) { -// *rxflag |= RXFLAG_L4CS_VALID; -// } -// } -// // Ipv4 checksum -// if (tmp_next->ipcs) { -// *rxflag |= RXFLAG_IPCS; -// if (!(tmp_next->ipe)) { -// *rxflag |= RXFLAG_IPCS_VALID; -// } -// } - -// ixgmq_.circ_buffer_[ixgmq_.rx_head_]->SetLength(tmp_next->pkt_len); -// b->PrependChain(std::move(ixgmq_.circ_buffer_[ixgmq_.rx_head_])); - -// //ebbrt::kprintf("LAST RSC desc = %d len=%d tail=%d\n\n", ixgmq_.rx_head_, tmp_next->pkt_len, ixgmq_.rx_tail_); - -// // bump head ptr -// ixgmq_.rx_head_ = (ixgmq_.rx_head_ + 1) % ixgmq_.rx_size_; -// // bump tail ptr -// ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; - -// // Process Packet -// root_.itf_.Receive(std::move(b), *rxflag); - -// // reset descriptors -// for (i=0;i((ixgmq_.circ_buffer_[start_header+i])->MutData()); -// // update descriptor with new buffer adder -// ixgmq_.rx_ring_[start_header+i].buffer_address = rxphys; -// } - -// // tell NIC which descriptors are free -// asm volatile("lfence" ::: "memory"); -// asm volatile("sfence" ::: "memory"); -// //WriteRdt_1(mcore, ixgmq_.rx_tail_); - -// // Clean up descriptors -// *process_rsc = true; -// return 1; -// } else { -// ebbrt::kabort("\t GetRxBuf(): Unknown RX packet descriptor\n"); -// return 0; -// } - -// /*ixgmq_.rsc_chain_.clear(); - -// ixgmq_.rsc_chain_.emplace_back( -// std::make_pair(ixgmq_.rx_head_, static_cast(tmp->pkt_len))); -// //ebbrt::kprintf("*** GetRxBuf START\n \t rx_head=%u rsccnt=%u len=%d dd=%u eop=%u nextp=%u\n", ixgmq_.rx_head_, rsccnt, tmp->pkt_len, tmp->dd, tmp->eop, 
tmp->next_descriptor_ptr); - -// rdesc_adv_wb_t* tmp2; -// uint32_t rh = tmp->next_descriptor_ptr; -// tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[rh])); -// while(!(tmp2->eop)) { -// //ebbrt::kprintf_force("\t GetRxBuf() rx_head_=%d rsccnt=%d len=%d dd=%u eop=%d nextp=%d \n", rh, tmp2->rsccnt, tmp2->pkt_len, tmp2->dd, tmp2->eop, tmp2->next_descriptor_ptr); -// ixgmq_.rsc_chain_.emplace_back( -// std::make_pair(rh, static_cast(tmp2->pkt_len))); -// rh = tmp2->next_descriptor_ptr; -// tmp2 = reinterpret_cast(&(ixgmq_.rx_ring_[rh])); -// } + } else if(s == "start_stats") { + //ebbrt::kprintf_force("start_stats on core %u\n", v); + ixgmq[v]->collect_stats = true; + ixgmq[v]->perfCycles.Start(); + ixgmq[v]->perfRefCycles.Start(); + ixgmq[v]->perfInst.Start(); + ixgmq[v]->perfLLC_miss.Start(); + ixgmq[v]->powerMeter.Start(); + } + else if(s == "stop_stats") { + //ebbrt::kprintf_force("stop_stats on core %u\n", v); + ixgmq[v]->collect_stats = false; + ixgmq[v]->perfCycles.Stop(); + ixgmq[v]->perfRefCycles.Stop(); + ixgmq[v]->perfInst.Stop(); + ixgmq[v]->perfLLC_miss.Stop(); + ixgmq[v]->powerMeter.Stop(); - + } else if(s == "clear_stats") { + //ebbrt::kprintf_force("clear_stats on core %u\n", v); + ixgmq[v]->perfCycles.Clear(); + ixgmq[v]->perfRefCycles.Clear(); + ixgmq[v]->perfInst.Clear(); + ixgmq[v]->perfLLC_miss.Clear(); + ixgmq[v]->powerMeter.Clear(); -// ixgmq_.rsc_chain_.emplace_back( -// std::make_pair(rh, static_cast(tmp2->pkt_len))); - -// // bump head ptr -// ixgmq_.rx_head_ = (rh + 1) % ixgmq_.rx_size_; -// *process_rsc = true; -// return 1;*/ - -// return 0; -// } - -/*void ebbrt::IxgbeDriverRep::ReceivePoll() { - uint32_t len; - uint64_t bAddr; - uint64_t rxflag; - bool process_rsc; - uint32_t count; - uint32_t rnt; - uint32_t rxhead; - process_rsc = false; - rxflag = 0; - count = 0; - rnt = 0; - //uint32_t mcore = static_cast(Cpu::GetMine()); -#ifdef STATS_EN - ixgmq_.stat_num_recv ++; -#endif - - // while there are still packets received - while 
(GetRxBuf(&len, &bAddr, &rxflag, &process_rsc, &rnt, &rxhead) == 1) { - // hit last rsc context, start to process all buffers - if (process_rsc) { - } - else { - count ++; - -#ifdef STATS_EN - ixgmq_.stat_num_rx_bytes += len; -#endif - - ixgmq_.circ_buffer_[rxhead]->SetLength(len); - auto b = std::move(ixgmq_.circ_buffer_[rxhead]); - - // bump tail ptr - ixgmq_.rx_tail_ = (ixgmq_.rx_tail_ + 1) % ixgmq_.rx_size_; - //ebbrt::kprintf("\t ReceivePoll() on core: %d len=%d\n", mcore, len); - - root_.itf_.Receive(std::move(b), rxflag); - - // reset buffer - ixgmq_.rx_ring_[rxhead].raw[0] = 0; - ixgmq_.rx_ring_[rxhead].raw[1] = 0; - // allocate new rx buffer - ixgmq_.circ_buffer_[rxhead] = std::move(MakeUniqueIOBuf(IxgbeDriver::RXBUFSZ)); - auto rxphys = - reinterpret_cast((ixgmq_.circ_buffer_[rxhead])->MutData()); - // update buffer with new adder - ixgmq_.rx_ring_[rxhead].buffer_address = rxphys; - - asm volatile("lfence" ::: "memory"); - asm volatile("sfence" ::: "memory"); - //WriteRdt_1(mcore, ixgmq_.rx_tail_); - + //memset(ixgbe_logs[v], 0, sizeof(IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry))); + for (uint32_t i = 0; i < ixgbe_stats[v].itr_cnt; i++) { + ixgbe_logs[v][i].Fields.rx_desc=0; + ixgbe_logs[v][i].Fields.rx_bytes=0; + ixgbe_logs[v][i].Fields.tx_desc=0; + ixgbe_logs[v][i].Fields.tx_bytes=0; + ixgbe_logs[v][i].Fields.ninstructions=0; + ixgbe_logs[v][i].Fields.ncycles=0; + ixgbe_logs[v][i].Fields.nref_cycles=0; + ixgbe_logs[v][i].Fields.nllc_miss=0; + ixgbe_logs[v][i].Fields.c3=0; + ixgbe_logs[v][i].Fields.c6=0; + ixgbe_logs[v][i].Fields.c7=0; + ixgbe_logs[v][i].Fields.joules=0; + ixgbe_logs[v][i].Fields.tsc=0; } + ixgbe_stats[v].itr_joules_last_tsc = 0; + ixgbe_stats[v].itr_cnt =0; + ixgbe_stats[v].itr_cnt2 =0; + ixgbe_stats[v].rdtsc_start = 0; + ixgbe_stats[v].rdtsc_end = 0; + ixgbe_stats[v].repeat =0; + ixgbe_stats[v].dvfs =0; + ixgbe_stats[v].rapl =0; + ixgbe_stats[v].itr =0; + ixgbe_stats[v].iter =0; + } else { + ebbrt::kprintf_force("%s Unknown command: 
%s\n", __FUNCTION__, s.c_str()); } - - // TODO: Update tail register here or above? -// if (count > 0) { - // update reg - // WriteRdt_1(mcore, ixgmq_.rx_tail_); - //} - }*/ +} diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index ae0e1206..8b4c2e14 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -20,6 +20,7 @@ #include "SlabAllocator.h" #include "Perf.h" #include "Rapl.h" +#include "Trace.h" // Receive Side Scaling (RSC) enabled //#define RSC_EN @@ -30,30 +31,54 @@ //#define JUMBO_EN // Collect Statistics Flag -#define STATS_EN +//#define STATS_EN //#define MAX_DESC +union IxgbeLogEntry { + long long data[12]; + struct { + long long tsc; + long long ninstructions; + long long ncycles; + long long nref_cycles; + long long nllc_miss; + long long joules; + long long c3; + long long c6; + long long c7; + + int rx_desc; + int rx_bytes; + int tx_desc; + int tx_bytes; + + long long pad; + } __attribute((packed)) Fields; +} __attribute((packed)); + +#define IXGBE_CACHE_LINE_SIZE 64 +#define IXGBE_LOG_SIZE 4000000U +#define TSC_KHZ 2899999 + +struct IxgbeLog { + uint64_t itr_joules_last_tsc; + uint64_t rdtsc_start; + uint64_t rdtsc_end; + uint32_t itr_cnt; + uint32_t itr_cnt2; + uint32_t repeat; + uint32_t dvfs; + uint32_t rapl; + uint32_t itr; + uint32_t iter; +} __attribute__((packed, aligned(IXGBE_CACHE_LINE_SIZE))); + +extern struct IxgbeLog ixgbe_stats[16]; +extern union IxgbeLogEntry *ixgbe_logs[16]; +extern std::unique_ptr bsendbufs[16]; namespace ebbrt { -// Per-core receive and transmit queue -typedef struct { - rdesc_legacy_t* rx_ring; - uint32_t rx_head; - uint32_t rx_tail; - uint32_t rx_size; - - tdesc_legacy_t* tx_ring; - uint32_t* tx_head; - uint32_t tx_tail; - uint32_t tx_last_tail; - uint32_t tx_size; - bool* tx_isctx; - - // buffers holding packet data - std::vector> circ_buffer; -} e10k_queue_t; - class IxgbeDriverRep; class IxgbeDriver : public EthernetDevice { @@ -150,13 +175,6 @@ class IxgbeDriver : public 
EthernetDevice { circ_buffer_.emplace_back(MakeUniqueIOBuf(RXBUFSZ, true)); } - // rsc_chain_ is a map between receive descriptor number and - // packet len, need packet len to extract out - // packet data else code will read redundant - // zeros if packet len does not use full buffer - // TODO: should be optimized - rsc_chain_.reserve(NRXDESCS+1); - // keep a log of number of idle times idle_times_.reserve(NRXDESCS); @@ -165,11 +183,6 @@ class IxgbeDriver : public EthernetDevice { for (uint32_t k = 0; k < NRXDESCS; k++) { tx_iseop[k] = false; } - - // keeps a log of descriptors where eop == 1 - // used to coalesce reclaiming of tx descriptors - // once the threshold of some limit is hit - //send_to_watch.reserve(NRXDESCS); // RX ring buffer allocation auto sz = align::Up(sizeof(rdesc_legacy_t) * NRXDESCS, 4096); @@ -191,16 +204,6 @@ class IxgbeDriver : public EthernetDevice { memset(addr, 0, sz); tx_ring_ = static_cast(addr); - // TX adv context buffer allocation - /*sz = align::Up(sizeof(bool) * NTXDESCS, 4096); - order = Fls(sz - 1) - pmem::kPageShift + 1; - page = page_allocator->Alloc(order, nid); - kbugon(page == Pfn::None(), "ixgbe: page allocation failed in %s", - __FUNCTION__); - addr = reinterpret_cast(page.ToAddr()); - memset(addr, 0, sz); - tx_isctx_ = static_cast(addr);*/ - #ifdef TX_HEAD_WB // TODO: not sure how much exactly to allocate for head wb addr tx_head_ = (uint32_t*)malloc(4 * sizeof(uint32_t)); @@ -251,9 +254,7 @@ class IxgbeDriver : public EthernetDevice { uint64_t cleaned_count{0}; std::vector> circ_buffer_; - std::vector> rsc_chain_; std::unordered_map idle_times_; - std::vector> send_to_watch; std::vector tx_iseop; std::ostringstream str_stats; //std::vector send_to_watch; @@ -272,10 +273,13 @@ class IxgbeDriver : public EthernetDevice { #endif // stats - uint64_t stat_num_recv{0}; - uint64_t stat_num_send{0}; - uint64_t stat_num_rx_bytes{0}; - uint64_t stat_num_tx_bytes{0}; + int stat_num_rx_desc{0}; + int stat_num_tx_desc{0}; + int 
stat_num_rx_bytes{0}; + int stat_num_tx_bytes{0}; + uint64_t total_tx_bytes{0}; + uint64_t total_rx_bytes{0}; + uint64_t time_us{0}; uint64_t time_send{0}; uint64_t time_idle_min{999999}; @@ -288,6 +292,9 @@ class IxgbeDriver : public EthernetDevice { uint64_t fireCount{0}; uint32_t rapl_val{666}; uint32_t itr_val{8}; + std::chrono::nanoseconds itr_joules_last_ts{0}; + bool collect_stats{false}; + std::vector tx_desc_counts; std::vector rx_desc_counts; double totalNrg{0.0}; @@ -298,6 +305,7 @@ class IxgbeDriver : public EthernetDevice { bool stat_init{false}; ebbrt::perf::PerfCounter perfCycles; ebbrt::perf::PerfCounter perfInst; + ebbrt::perf::PerfCounter perfRefCycles; ebbrt::perf::PerfCounter perfLLC_ref; ebbrt::perf::PerfCounter perfLLC_miss; ebbrt::perf::PerfCounter perfTLB_store_miss; @@ -527,7 +535,6 @@ class IxgbeDriver : public EthernetDevice { // dump per core stats if STATS_EN void DumpStats(); - e10k_queue_t& GetQueue() const { return *ixgq; } e10Kq& GetMultiQueue(uint32_t index) const { return *ixgmq[index]; } @@ -540,7 +547,6 @@ class IxgbeDriver : public EthernetDevice { volatile uint32_t kIxgbeStatus; }; - e10k_queue_t* ixgq; uint8_t rcv_vector{0}; std::vector> ixgmq; @@ -558,7 +564,8 @@ class IxgbeDriverRep : public MulticoreEbb, Timer:: void Send(std::unique_ptr buf, PacketInfo pinfo); void SendUdp(std::unique_ptr buf, uint64_t len, PacketInfo pinfo); void SendTCPChained(std::unique_ptr buf, uint64_t len, uint64_t num_chains, PacketInfo pinfo); - void SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo); + //void SendTCPUnchained(std::unique_ptr buf, uint64_t len, PacketInfo pinfo); + void SendTCPUnchained(uint64_t bdata, uint64_t len, PacketInfo pinfo); //void AddContext(uint8_t idx, uint8_t maclen, uint16_t iplen, uint8_t l4len, // enum l4_type l4type); @@ -583,7 +590,6 @@ class IxgbeDriverRep : public MulticoreEbb, Timer:: void Fire() override; const IxgbeDriver& root_; - e10k_queue_t& ixgq_; IxgbeDriver::e10Kq& ixgmq_; 
EventManager::IdleCallback receive_callback_; diff --git a/src/native/Rapl.h b/src/native/Rapl.h index 1b2a326b..8d92ea79 100644 --- a/src/native/Rapl.h +++ b/src/native/Rapl.h @@ -92,6 +92,16 @@ namespace rapl { RaplCounter& operator=(const RaplCounter& other) = delete; ~RaplCounter(); + + void Clear() { + counter_offset = 0.0; + } + + uint64_t ReadMsr() + { + return ebbrt::msr::Read(kMsrIntelPkgEnergyStatus); + } + void Start() { uint64_t res = ebbrt::msr::Read(kMsrIntelPkgEnergyStatus); counter_offset = (double)res*rapl_cpu_energy_units; @@ -108,7 +118,7 @@ namespace rapl { double after = (double)res*rapl_cpu_energy_units; //ebbrt::kprintf("Package Energy after: %.6fJ\n", after); counter_offset = after - counter_offset; - //ebbrt::kprintf("Total Package Energy used: %.6fJ\n", after - counter_offset); + ebbrt::kprintf_force("Total Package Energy used: %.6fJ\n", counter_offset); } void SetLimit(uint32_t v) { @@ -134,7 +144,7 @@ namespace rapl { uint32_t high = (result >> 32) & 0xFFFFFFFF; asm volatile("wrmsr" : : "c"(kMsrPkgRaplPowerLimit), "a"(low), "d"(high)); - result=ebbrt::msr::Read(kMsrPkgRaplPowerLimit); + /*result=ebbrt::msr::Read(kMsrPkgRaplPowerLimit); ebbrt::kprintf("%u Package power limits are %s\n", v, (result >> 63) ? "locked" : "unlocked"); double pkg_power_limit_1 = rapl_power_units*(double)((result>>0)&0x7FFF); double pkg_time_window_1 = rapl_time_units*(double)((result>>17)&0x007F); @@ -148,6 +158,7 @@ namespace rapl { pkg_power_limit_2, pkg_time_window_2, (result & (1LL<<47)) ? "enable power limit" : "disabled", (result & (1LL<<48)) ? 
"clamped" : "not_clamped"); + */ } double Read(); diff --git a/src/native/Uart8250.cc b/src/native/Uart8250.cc index eba7cd25..e510e04f 100644 --- a/src/native/Uart8250.cc +++ b/src/native/Uart8250.cc @@ -11,7 +11,7 @@ #include "Io.h" namespace { -const constexpr uint16_t kPort = 0x3f8; +const constexpr uint16_t kPort = 0x2f8; // when DLAB = 0 const constexpr uint16_t kDataReg = 0; const constexpr uint16_t kIntEnable = 1; From eb4072ab2729a7f877a8e00b5cc13db7791554de Mon Sep 17 00:00:00 2001 From: Han Date: Thu, 29 Oct 2020 16:03:15 -0400 Subject: [PATCH 19/20] fixed ref cycle counter, previous implementation did not report correct value --- src/native/EventManager.cc | 2 +- src/native/EventManager.h | 2 + src/native/IxgbeDriver.cc | 195 +++++++++++++++++++++++++++++-------- src/native/IxgbeDriver.h | 8 +- 4 files changed, 163 insertions(+), 44 deletions(-) diff --git a/src/native/EventManager.cc b/src/native/EventManager.cc index 705a0cc4..f557bd61 100644 --- a/src/native/EventManager.cc +++ b/src/native/EventManager.cc @@ -173,7 +173,7 @@ void ebbrt::EventManager::Process() { // C6 0x20 // C7 0x30 ecx = 1; /* break on interrupt flag */ - eax = 0x30; + eax = 0x30; /* we always pick the deepest sleep state */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); diff --git a/src/native/EventManager.h b/src/native/EventManager.h index ba3195c8..257f008c 100644 --- a/src/native/EventManager.h +++ b/src/native/EventManager.h @@ -23,6 +23,8 @@ #include "Trans.h" #include "VMemAllocator.h" +extern uint64_t nsleep_states; + namespace ebbrt { class EventManager : Timer::Hook { diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 5ae00bea..6424cb86 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -536,6 +536,11 @@ void ebbrt::IxgbeDriverRep::Send(std::unique_ptr buf, PacketInfo pinfo) { return; } num_chains = buf->CountChainElements(); + + /*if(mcore == 15) { + ebbrt::kprintf_force("\t Sending on core 15??"); + 
return; + }*/ #ifndef TX_HEAD_WB ReclaimTx(); @@ -1615,7 +1620,7 @@ void ebbrt::IxgbeDriver::GlobalReset() { **/ void ebbrt::IxgbeDriver::Init() { uint64_t d_mac; - uint32_t ncore = static_cast(Cpu::Count()); + ebbrt::kprintf("%s ", __PRETTY_FUNCTION__); bar0_.Map(); // allocate virtual memory @@ -1811,6 +1816,46 @@ void ebbrt::IxgbeDriver::Init() { // Fill in RSS redirection table (128 entries), sets which core the lowest 7 bits of hashed output goes to // hacky atm + // memcached-silo -- avoid firing interrupts on core 15 + /*uint32_t i = 0; + WriteReta(i+0, 0x03020100); + WriteReta(i+1, 0x07060504); + WriteReta(i+2, 0x0B0A0908); + WriteReta(i+3, 0x000E0D0C); + WriteReta(i+4, 0x04030201); + WriteReta(i+5, 0x08070605); + WriteReta(i+6, 0x0C0B0A09); + WriteReta(i+7, 0x01000E0D); + + WriteReta(i+8, 0x05040302); + WriteReta(i+9, 0x09080706); + WriteReta(i+10, 0x0D0C0B0A); + WriteReta(i+11, 0x0201000E); + WriteReta(i+12, 0x06050403); + WriteReta(i+13, 0x0A090807); + WriteReta(i+14, 0x0E0D0C0B); + WriteReta(i+15, 0x03020100); + + WriteReta(i+16, 0x07060504); + WriteReta(i+17, 0x0B0A0908); + WriteReta(i+18, 0x000E0D0C); + WriteReta(i+19, 0x04030201); + WriteReta(i+20, 0x08070605); + WriteReta(i+21, 0x0C0B0A09); + WriteReta(i+22, 0x01000E0D); + WriteReta(i+23, 0x05040302); + + WriteReta(i+24, 0x09080706); + WriteReta(i+25, 0x0D0C0B0A); + WriteReta(i+26, 0x0201000E); + WriteReta(i+27, 0x06050403); + WriteReta(i+28, 0x0A090807); + WriteReta(i+29, 0x0E0D0C0B); + WriteReta(i+30, 0x03020100); + WriteReta(i+31, 0x07060504); + */ + + uint32_t ncore = static_cast(Cpu::Count()); for (auto i = 0; i < 32; i += 4) { // all route to core 0 if(ncore == 1) { @@ -1819,7 +1864,7 @@ void ebbrt::IxgbeDriver::Init() { WriteReta(i+2, 0x0000000); WriteReta(i+3, 0x0000000); } else if(ncore == 2) { - WriteReta(i, 0x1010100); + WriteReta(i, 0x1010100); WriteReta(i+1, 0x1010100); WriteReta(i+2, 0x1010100); WriteReta(i+3, 0x1010100); @@ -1839,15 +1884,19 @@ void ebbrt::IxgbeDriver::Init() { 
WriteReta(i+2, 0x3020100); WriteReta(i+3, 0x7060504); } else if(ncore == 16){ - /*WriteReta(i, 0x03020100); + // memcached + /*WriteReta(i+0, 0x03020100); WriteReta(i+1, 0x07060504); WriteReta(i+2, 0x0B0A0908); - WriteReta(i+3, 0x0F0E0D0C);*/ - ebbrt::kprintf_force("+++ all interrupts firing on Core 1\n"); - WriteReta(i, 0x1010101); - WriteReta(i+1, 0x1010101); - WriteReta(i+2, 0x1010101); - WriteReta(i+3, 0x1010101); + WriteReta(i+3, 0x0F0E0D0C); + */ + + // nodejs -- all on core 1 + ebbrt::kprintf_force("*** NodeJS firing all on core 1\n"); + WriteReta(i+0, 0x01010101); + WriteReta(i+1, 0x01010101); + WriteReta(i+2, 0x01010101); + WriteReta(i+3, 0x01010101); } else { ebbrt::kabort("%s: Can only redirect interrupts to 16 cores\n", __FUNCTION__); } @@ -2029,7 +2078,7 @@ void ebbrt::IxgbeDriver::SetupMultiQueue(uint32_t i) { // setup RX interrupts for queue i dev_.SetMsixEntry(i, rcv_vector, ebbrt::Cpu::GetByIndex(i)->apic_id()); - //ebbrt::kprintf("Core %d: BSIZEPACKET=%d bytes NTXDESCS=%d NRXDESCS=%d ITR_INTERVAL=%dus RCV_VECTOR=%d APIC_ID=%d \n", i, RXBUFSZ, NTXDESCS, NRXDESCS, (int) (IxgbeDriver::ITR_INTERVAL * 2), (int)rcv_vector, (int)(ebbrt::Cpu::GetByIndex(i)->apic_id())); + ebbrt::kprintf_force("Core %d: BSIZEPACKET=%d bytes NTXDESCS=%d NRXDESCS=%d ITR_INTERVAL=%dus RCV_VECTOR=%d APIC_ID=%d \n", i, RXBUFSZ, NTXDESCS, NRXDESCS, (int) (IxgbeDriver::ITR_INTERVAL * 2), (int)rcv_vector, (int)(ebbrt::Cpu::GetByIndex(i)->apic_id())); // don't set up interrupts for tx since we have head writeback?? 
auto qn = i / 2; // put into correct IVAR @@ -2174,8 +2223,38 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { uint64_t now = 0, last = 0; uint64_t cjoules, cins, ccyc, crefcyc, cllc; uint64_t c3, c6, c7; + //uint32_t eicr; + + c3 = c6 = c7 = 0; + + if(ixgmq_.start_perf == false) { + uint32_t index, low, high; + uint64_t data; + + data = 0x333; + index = 0x38D; + low = (uint32_t)(data & 0xFFFFFFFF); + high = (data >> 32) & 0xFFFFFFFF; + asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); + + data = 0x43412E; + index = 0x186; + low = (uint32_t)(data & 0xFFFFFFFF); + high = (data >> 32) & 0xFFFFFFFF; + asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); + + data = 0x700000001; + index = 0x38F; + low = (uint32_t)(data & 0xFFFFFFFF); + high = (data >> 32) & 0xFFFFFFFF; + asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); + + ixgmq_.start_perf = true; + } if(ixgmq_.collect_stats) { + ccyc = 0; + cllc = 0; icnt = ixgbe_stats[mcore].itr_cnt; ixgbe_stats[mcore].itr_cnt2 ++; @@ -2183,7 +2262,12 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { //get current tsc and store it now = ebbrt::trace::rdtsc(); kassert(now != 0); - __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.tsc), now); + + //eicr = ReadEicr(); + //ixgbe_logs[mcore][icnt].Fields.c3 = eicr; + + //__builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.tsc), now); + ixgbe_logs[mcore][icnt].Fields.tsc = now; // get last tsc last = ixgbe_stats[mcore].itr_joules_last_tsc; @@ -2191,25 +2275,42 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { // ~ 1 ms has passed if ((now - last) > TSC_KHZ) { cjoules = ixgmq_.powerMeter.ReadMsr(); - cins = ixgmq_.perfInst.Read(); - ccyc = ixgmq_.perfCycles.Read(); - crefcyc = ixgmq_.perfRefCycles.Read(); - cllc = ixgmq_.perfLLC_miss.Read(); - - c3 = ebbrt::msr::Read(0x3FC); - c6 = ebbrt::msr::Read(0x3FD); - c7 = ebbrt::msr::Read(0x3FE); - - __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.joules), cjoules); + if (ixgmq_.start_perf) { + cins = 
ebbrt::msr::Read(0x309); + ccyc = ebbrt::msr::Read(0x30A); + crefcyc = ebbrt::msr::Read(0x30B); + cllc = ebbrt::msr::Read(0xC1); + } + //cins = ixgmq_.perfInst.Read(); + //ccyc = ixgmq_.perfCycles.Read(); + //crefcyc = ixgmq_.perfRefCycles.Read(); + //cllc = ixgmq_.perfLLC_miss.Read(); + + //c3 = ebbrt::msr::Read(0x3FC); + //c6 = ebbrt::msr::Read(0x3FD); + //c7 = ebbrt::msr::Read(0x3FE); + + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.joules), cjoules); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ninstructions), cins); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ncycles), ccyc); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nref_cycles), crefcyc); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nllc_miss), cllc); + /*ixgbe_logs[mcore][icnt].Fields.joules = cjoules; + ixgbe_logs[mcore][icnt].Fields.ninstructions = cins; + ixgbe_logs[mcore][icnt].Fields.ncycles = ccyc; + ixgbe_logs[mcore][icnt].Fields.nref_cycles = crefcyc; + ixgbe_logs[mcore][icnt].Fields.nllc_miss = cllc;*/ + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c3), c3); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c6), c6); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c7), c7); + /*ixgbe_logs[mcore][icnt].Fields.c3 = c3; + ixgbe_logs[mcore][icnt].Fields.c6 = c6; + ixgbe_logs[mcore][icnt].Fields.c7 = c7; + */ + ixgbe_stats[mcore].itr_joules_last_tsc = now; } @@ -2217,7 +2318,12 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.rx_bytes), ixgmq_.stat_num_rx_bytes); __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_desc), ixgmq_.stat_num_tx_desc); __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_bytes), ixgmq_.stat_num_tx_bytes); - + + /*ixgbe_logs[mcore][icnt].Fields.rx_desc = ixgmq_.stat_num_rx_desc; + ixgbe_logs[mcore][icnt].Fields.rx_bytes = ixgmq_.stat_num_rx_bytes; + ixgbe_logs[mcore][icnt].Fields.tx_desc = ixgmq_.stat_num_tx_desc; + 
ixgbe_logs[mcore][icnt].Fields.tx_bytes = ixgmq_.stat_num_tx_bytes;*/ + ixgmq_.stat_num_rx_bytes = 0; ixgmq_.stat_num_rx_desc = 0; ixgmq_.stat_num_tx_bytes = 0; @@ -2399,9 +2505,15 @@ ebbrt::IxgbeDriverRep::IxgbeDriverRep(const IxgbeDriver& root) : root_(root), ixgmq_(root.GetMultiQueue(Cpu::GetMine())), receive_callback_([this]() { ReceivePoll(); }) { //this->ReceivePoll(); - ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); - ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); - ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); + //ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::fixed_cycles); + //ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::fixed_instructions); + + //ixgmq_.perfRefCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::fixed_reference_cycles); + //ixgmq_.perfInst = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::instructions); + //ixgmq_.perfCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::cycles); + //ixgmq_.perfRefCycles = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::reference_cycles); + //ixgmq_.perfLLC_miss = ebbrt::perf::PerfCounter(ebbrt::perf::PerfEvent::llc_misses); + ixgmq_.powerMeter = ebbrt::rapl::RaplCounter(); } @@ -2415,7 +2527,7 @@ void ebbrt::IxgbeDriverRep::IxgbeDriverRep::StopTimer() { } void ebbrt::IxgbeDriverRep::IxgbeDriverRep::Fire() { - uint32_t mcore = static_cast(Cpu::GetMine()); + /*uint32_t mcore = static_cast(Cpu::GetMine()); ixgmq_.perfCycles.Stop(); ixgmq_.perfInst.Stop(); @@ -2442,10 +2554,15 @@ void ebbrt::IxgbeDriverRep::IxgbeDriverRep::Fire() { if(mcore == 0 || mcore == 1) { ixgmq_.powerMeter.Start(); } - ixgmq_.fireCount += 1; + ixgmq_.fireCount += 1; */ //ebbrt::kprintf_force("Core %u: Fire() %llu\n", mcore, ixgmq_.fireCount); } +uint32_t ebbrt::IxgbeDriverRep::ReadEicr() { + auto reg = root_.bar0_.Read32(0x00800); + return reg & 0xFFFFFFFF; +} + uint16_t 
ebbrt::IxgbeDriverRep::ReadRdh_1(uint32_t n) { auto reg = root_.bar0_.Read32(0x01010 + 0x40 * n); return reg & 0xFFFF; @@ -2520,27 +2637,27 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { } else if(s == "start_stats") { //ebbrt::kprintf_force("start_stats on core %u\n", v); ixgmq[v]->collect_stats = true; - ixgmq[v]->perfCycles.Start(); - ixgmq[v]->perfRefCycles.Start(); - ixgmq[v]->perfInst.Start(); - ixgmq[v]->perfLLC_miss.Start(); + //ixgmq[v]->perfCycles.Start(); + //ixgmq[v]->perfRefCycles.Start(); + //ixgmq[v]->perfInst.Start(); + //ixgmq[v]->perfLLC_miss.Start(); ixgmq[v]->powerMeter.Start(); } else if(s == "stop_stats") { //ebbrt::kprintf_force("stop_stats on core %u\n", v); ixgmq[v]->collect_stats = false; - ixgmq[v]->perfCycles.Stop(); - ixgmq[v]->perfRefCycles.Stop(); - ixgmq[v]->perfInst.Stop(); - ixgmq[v]->perfLLC_miss.Stop(); + //ixgmq[v]->perfCycles.Stop(); + //ixgmq[v]->perfRefCycles.Stop(); + //ixgmq[v]->perfInst.Stop(); + //ixgmq[v]->perfLLC_miss.Stop(); ixgmq[v]->powerMeter.Stop(); } else if(s == "clear_stats") { //ebbrt::kprintf_force("clear_stats on core %u\n", v); - ixgmq[v]->perfCycles.Clear(); - ixgmq[v]->perfRefCycles.Clear(); - ixgmq[v]->perfInst.Clear(); - ixgmq[v]->perfLLC_miss.Clear(); + //ixgmq[v]->perfCycles.Clear(); + //ixgmq[v]->perfRefCycles.Clear(); + //ixgmq[v]->perfInst.Clear(); + //ixgmq[v]->perfLLC_miss.Clear(); ixgmq[v]->powerMeter.Clear(); //memset(ixgbe_logs[v], 0, sizeof(IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry))); diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 8b4c2e14..3bdfc061 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -35,7 +35,7 @@ //#define MAX_DESC union IxgbeLogEntry { - long long data[12]; + long long data[11]; struct { long long tsc; long long ninstructions; @@ -50,9 +50,7 @@ union IxgbeLogEntry { int rx_desc; int rx_bytes; int tx_desc; - int tx_bytes; - - long long pad; + int tx_bytes; } __attribute((packed)) Fields; } __attribute((packed)); 
@@ -294,6 +292,7 @@ class IxgbeDriver : public EthernetDevice { uint32_t itr_val{8}; std::chrono::nanoseconds itr_joules_last_ts{0}; bool collect_stats{false}; + bool start_perf{false}; std::vector tx_desc_counts; std::vector rx_desc_counts; @@ -583,6 +582,7 @@ class IxgbeDriverRep : public MulticoreEbb, Timer:: void WriteEimcn(uint32_t n, uint32_t m); void WriteEimc(uint32_t m); void WriteEims(uint32_t m); + uint32_t ReadEicr(); uint32_t ReadTdh_1(uint32_t n); uint32_t ReadTdt_1(uint32_t n); uint32_t GetRxBuf(uint32_t* len, uint64_t* bAddr, uint64_t* rxflag, From aab9a19f9996c44a97752017721dc9a6bc1e6943 Mon Sep 17 00:00:00 2001 From: Han Date: Tue, 15 Dec 2020 14:32:53 -0500 Subject: [PATCH 20/20] added some counters at various eventmanager and driver locations --- src/native/EventManager.cc | 46 +++++++++++--- src/native/EventManager.h | 17 ++++- src/native/IxgbeDriver.cc | 126 +++++++++++++++++++++---------------- src/native/IxgbeDriver.h | 1 + src/native/Main.cc | 34 ++++++++++ src/native/Timer.cc | 11 ++++ 6 files changed, 171 insertions(+), 64 deletions(-) diff --git a/src/native/EventManager.cc b/src/native/EventManager.cc index f557bd61..eb8eda93 100644 --- a/src/native/EventManager.cc +++ b/src/native/EventManager.cc @@ -16,6 +16,19 @@ #include "Trace.h" #include "VMem.h" +uint32_t nsleep_states[16]; +uint32_t sleep_state[16]; +/*uint32_t processCnt[16]; +uint32_t swEventCnt[16]; +uint32_t idleEventCnt[16]; +uint32_t processInterruptCntAll[16]; +uint32_t processInterruptCntA[16]; +uint32_t processInterruptCntB[16]; +uint32_t processInterruptCntC[16]; +uint32_t passTokenCnt[16]; +uint32_t receiveTokenCnt[16]; +uint32_t genFireCnt[16];*/ + namespace { struct InterruptHandler { ebbrt::RcuHListHook hook; @@ -131,8 +144,9 @@ template void ebbrt::EventManager::InvokeFunction(F&& f) { void ebbrt::EventManager::Process() { auto stack_top = (active_event_context_.stack + kStackPages).ToAddr(); + uint32_t mycpu = static_cast(Cpu::GetMine());
Cpu::GetMine().SetEventStack(stack_top); - unsigned long ecx, edx, eax; + uint32_t ecx, edx, eax; ecx = edx = eax = 0; // process an interrupt without halting @@ -140,26 +154,29 @@ void ebbrt::EventManager::Process() { // instruction is executed (to allow for a halt for example). The nop gives us // a one instruction window to process an interrupt (before the cli) process: + //processCnt[mycpu]++; asm volatile("sti;" "nop;" "cli;"); // If an interrupt was processed then we would not reach this code (the // interrupt does not return here but instead to the top of this function) - //ebbrt::kprintf_force("p1\n"); if (!tasks_.empty()) { auto f = std::move(tasks_.front()); tasks_.pop_front(); InvokeFunction(f); + //swEventCnt[mycpu]++; // if we had a task to execute, then we go to the top again goto process; } if (idle_callback_) { + //idleEventCnt[mycpu]++; InvokeFunction(*idle_callback_); goto process; } - + + nsleep_states[mycpu] ++; asm volatile(".byte 0x0f, 0x01, 0xc8;" :: "a" ((void*)&flags), "c" (ecx), "d"(edx)); @@ -173,7 +190,7 @@ void ebbrt::EventManager::Process() { // C6 0x20 // C7 0x30 ecx = 1; /* break on interrupt flag */ - eax = 0x30; /* we always pick the deepest sleep state */ + eax = sleep_state[mycpu]; /* we always pick the deepest sleep state */ asm volatile("sti; .byte 0x0f, 0x01, 0xc9;" :: "a" (eax), "c" (ecx)); @@ -199,7 +216,7 @@ void ebbrt::EventManager::FreeStack(Pfn stack) { free_stacks_.push(stack); } static_assert(ebbrt::Cpu::kMaxCpus <= 256, "adjust event id calculation"); ebbrt::EventManager::EventManager(const RepMap& rm) - : reps_(rm), next_event_id_(Cpu::GetMine() << 24), + : reps_(rm), next_event_id_(Cpu::GetMine() << 24), active_event_context_(next_event_id_++, AllocateStack()) {} void ebbrt::EventManager::Spawn(MovableFunction func, @@ -352,20 +369,27 @@ uint8_t ebbrt::EventManager::AllocateVector(MovableFunction func) { } void ebbrt::EventManager::ProcessInterrupt(int num) { + //uint32_t mycpu = static_cast(Cpu::GetMine()); 
apic::Eoi(); + //processInterruptCntAll[mycpu]++; + if (num == 32) { // pull all remote tasks onto our queue std::lock_guard l(remote_.lock); tasks_.splice(tasks_.end(), std::move(remote_.tasks)); + //processInterruptCntA[mycpu]++; + } else if (num == 33) { + //processInterruptCntB[mycpu]++; ReceiveToken(); } else { + //processInterruptCntC[mycpu]++; auto ih = vec_data->map.find(num); kassert(ih != nullptr); auto& f = ih->func; InvokeFunction(f); } - //ebbrt::kprintf_force("ProcessInterrupt %d\n", num); + //ebbrt::kprintf_force("ProcessInterrupt %d\n", num);OA Process(); } @@ -404,7 +428,9 @@ ebbrt::EventManager::EventContext::EventContext(uint32_t event_id, Pfn stack) void ebbrt::EventManager::PassToken() { size_t my_cpu_index = Cpu::GetMine(); + //uint32_t mycpu = static_cast(Cpu::GetMine()); if (Cpu::Count() > 1) { + //passTokenCnt[mycpu] ++; auto next_cpu_index = (my_cpu_index + 1) % Cpu::Count(); auto next_cpu = Cpu::GetByIndex(next_cpu_index); kassert(next_cpu != nullptr); @@ -415,14 +441,18 @@ void ebbrt::EventManager::PassToken() { } void ebbrt::EventManager::ReceiveToken() { + //uint32_t mycpu = static_cast(Cpu::GetMine()); pending_generation_ = generation_++; - + //receiveTokenCnt[mycpu] ++; StartTimer(); } // Check Generation void ebbrt::EventManager::Fire() { - if (generation_count_[pending_generation_ % 2] == 0) { + //uint32_t mycpu = static_cast(Cpu::GetMine()); + //genFireCnt[mycpu] ++; + + if (generation_count_[pending_generation_ % 2] == 0) { // generation complete PassToken(); // temporarily store tasks that have now lived at least one entire diff --git a/src/native/EventManager.h b/src/native/EventManager.h index 257f008c..21a831f5 100644 --- a/src/native/EventManager.h +++ b/src/native/EventManager.h @@ -23,7 +23,22 @@ #include "Trans.h" #include "VMemAllocator.h" -extern uint64_t nsleep_states; +extern uint32_t nsleep_states[16]; +extern uint32_t sleep_state[16]; +/*extern uint32_t processCnt[16]; +extern uint32_t swEventCnt[16]; +extern 
uint32_t idleEventCnt[16]; +extern uint32_t processInterruptCntAll[16]; +extern uint32_t processInterruptCntA[16]; +extern uint32_t processInterruptCntB[16]; +extern uint32_t processInterruptCntC[16]; +extern uint32_t passTokenCnt[16]; +extern uint32_t receiveTokenCnt[16]; +extern uint32_t genFireCnt[16]; +extern uint32_t timerCnt[16]; +extern uint32_t fireCntA[16]; +extern uint32_t fireCntB[16]; +*/ namespace ebbrt { diff --git a/src/native/IxgbeDriver.cc b/src/native/IxgbeDriver.cc index 6424cb86..d15938ac 100644 --- a/src/native/IxgbeDriver.cc +++ b/src/native/IxgbeDriver.cc @@ -22,6 +22,7 @@ struct IxgbeLog ixgbe_stats[16]; union IxgbeLogEntry *ixgbe_logs[16]; std::unique_ptr bsendbufs[16]; +//uint64_t rxPollCnt[16]; void ebbrt::IxgbeDriver::Create(pci::Device& dev) { auto ixgbe_dev = new IxgbeDriver(dev); @@ -1854,9 +1855,8 @@ void ebbrt::IxgbeDriver::Init() { WriteReta(i+30, 0x03020100); WriteReta(i+31, 0x07060504); */ - uint32_t ncore = static_cast(Cpu::Count()); - for (auto i = 0; i < 32; i += 4) { + for (auto i = 0; i < 32; i += 4) { // all route to core 0 if(ncore == 1) { WriteReta(i, 0x0000000); @@ -1885,11 +1885,10 @@ void ebbrt::IxgbeDriver::Init() { WriteReta(i+3, 0x7060504); } else if(ncore == 16){ // memcached - /*WriteReta(i+0, 0x03020100); - WriteReta(i+1, 0x07060504); - WriteReta(i+2, 0x0B0A0908); - WriteReta(i+3, 0x0F0E0D0C); - */ + //WriteReta(i+0, 0x03020100); + //WriteReta(i+1, 0x07060504); + //WriteReta(i+2, 0x0B0A0908); + //WriteReta(i+3, 0x0F0E0D0C); // nodejs -- all on core 1 ebbrt::kprintf_force("*** NodeJS firing all on core 1\n"); @@ -2224,9 +2223,11 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { uint64_t cjoules, cins, ccyc, crefcyc, cllc; uint64_t c3, c6, c7; //uint32_t eicr; - + + //rxPollCnt[mcore]++; c3 = c6 = c7 = 0; - + + // hard coded for this processor to initialize PMC counter if(ixgmq_.start_perf == false) { uint32_t index, low, high; uint64_t data; @@ -2248,15 +2249,15 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { low = 
(uint32_t)(data & 0xFFFFFFFF); high = (data >> 32) & 0xFFFFFFFF; asm volatile("wrmsr" : : "c"(index), "a"(low), "d"(high)); - + ixgmq_.start_perf = true; } - + if(ixgmq_.collect_stats) { ccyc = 0; cllc = 0; icnt = ixgbe_stats[mcore].itr_cnt; - ixgbe_stats[mcore].itr_cnt2 ++; +// ixgbe_stats[mcore].itr_cnt2 ++; if (icnt < IXGBE_LOG_SIZE) { //get current tsc and store it @@ -2266,69 +2267,64 @@ void ebbrt::IxgbeDriverRep::ReceivePoll() { //eicr = ReadEicr(); //ixgbe_logs[mcore][icnt].Fields.c3 = eicr; - //__builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.tsc), now); - ixgbe_logs[mcore][icnt].Fields.tsc = now; + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.tsc), now); + //ixgbe_logs[mcore][icnt].Fields.tsc = now; // get last tsc last = ixgbe_stats[mcore].itr_joules_last_tsc; // ~ 1 ms has passed - if ((now - last) > TSC_KHZ) { + if ((now - last) > TSC_KHZ) { cjoules = ixgmq_.powerMeter.ReadMsr(); - if (ixgmq_.start_perf) { - cins = ebbrt::msr::Read(0x309); - ccyc = ebbrt::msr::Read(0x30A); - crefcyc = ebbrt::msr::Read(0x30B); - cllc = ebbrt::msr::Read(0xC1); - } - //cins = ixgmq_.perfInst.Read(); - //ccyc = ixgmq_.perfCycles.Read(); - //crefcyc = ixgmq_.perfRefCycles.Read(); - //cllc = ixgmq_.perfLLC_miss.Read(); - + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.joules), cjoules); + //ixgbe_logs[mcore][icnt].Fields.joules = cjoules; + //c3 = ebbrt::msr::Read(0x3FC); //c6 = ebbrt::msr::Read(0x3FD); //c7 = ebbrt::msr::Read(0x3FE); - - __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.joules), cjoules); - __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ninstructions), cins); - __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ncycles), ccyc); - __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nref_cycles), crefcyc); - __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nllc_miss), cllc); - - /*ixgbe_logs[mcore][icnt].Fields.joules = cjoules; - ixgbe_logs[mcore][icnt].Fields.ninstructions = cins; - 
ixgbe_logs[mcore][icnt].Fields.ncycles = ccyc; - ixgbe_logs[mcore][icnt].Fields.nref_cycles = crefcyc; - ixgbe_logs[mcore][icnt].Fields.nllc_miss = cllc;*/ - + c7 = nsleep_states[mcore]; __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c3), c3); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c6), c6); __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.c7), c7); + //ixgbe_logs[mcore][icnt].Fields.c3 = c3; + //ixgbe_logs[mcore][icnt].Fields.c6 = c6; + //ixgbe_logs[mcore][icnt].Fields.c7 = c7; + + if (ixgmq_.start_perf) { + cins = ebbrt::msr::Read(0x309); + ccyc = ebbrt::msr::Read(0x30A); + crefcyc = ebbrt::msr::Read(0x30B); + cllc = ebbrt::msr::Read(0xC1); - /*ixgbe_logs[mcore][icnt].Fields.c3 = c3; - ixgbe_logs[mcore][icnt].Fields.c6 = c6; - ixgbe_logs[mcore][icnt].Fields.c7 = c7; - */ + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ninstructions), cins); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.ncycles), ccyc); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nref_cycles), crefcyc); + __builtin_ia32_movnti64(&(ixgbe_logs[mcore][icnt].Fields.nllc_miss), cllc); + //ixgbe_logs[mcore][icnt].Fields.ninstructions = cins; + //ixgbe_logs[mcore][icnt].Fields.ncycles = ccyc; + //ixgbe_logs[mcore][icnt].Fields.nref_cycles = crefcyc; + //ixgbe_logs[mcore][icnt].Fields.nllc_miss = cllc; + } ixgbe_stats[mcore].itr_joules_last_tsc = now; - } - + } + + //ixgbe_logs[mcore][icnt].Fields.rx_desc = ixgmq_.stat_num_rx_desc; + //ixgbe_logs[mcore][icnt].Fields.rx_bytes = ixgmq_.stat_num_rx_bytes; + //ixgbe_logs[mcore][icnt].Fields.tx_desc = ixgmq_.stat_num_tx_desc; + //ixgbe_logs[mcore][icnt].Fields.tx_bytes = ixgmq_.stat_num_tx_bytes; __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.rx_desc), ixgmq_.stat_num_rx_desc); __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.rx_bytes), ixgmq_.stat_num_rx_bytes); __builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_desc), ixgmq_.stat_num_tx_desc); 
__builtin_ia32_movnti(&(ixgbe_logs[mcore][icnt].Fields.tx_bytes), ixgmq_.stat_num_tx_bytes); - - /*ixgbe_logs[mcore][icnt].Fields.rx_desc = ixgmq_.stat_num_rx_desc; - ixgbe_logs[mcore][icnt].Fields.rx_bytes = ixgmq_.stat_num_rx_bytes; - ixgbe_logs[mcore][icnt].Fields.tx_desc = ixgmq_.stat_num_tx_desc; - ixgbe_logs[mcore][icnt].Fields.tx_bytes = ixgmq_.stat_num_tx_bytes;*/ - + ixgmq_.stat_num_rx_bytes = 0; ixgmq_.stat_num_rx_desc = 0; ixgmq_.stat_num_tx_bytes = 0; ixgmq_.stat_num_tx_desc = 0; + ixgbe_stats[mcore].itr_cnt++; + nsleep_states[mcore] = 0; } } @@ -2634,6 +2630,9 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ixgmq[i]->powerMeter.SetLimit(v); ebbrt::kprintf_force("%u: rapl = %u\n", i, v); } + } else if(s == "sleep_state") { + sleep_state[i] = v; + ebbrt::kprintf_force("IxgbeDriver sleep_state[%u] = 0x%x\n", i, v); } else if(s == "start_stats") { //ebbrt::kprintf_force("start_stats on core %u\n", v); ixgmq[v]->collect_stats = true; @@ -2659,7 +2658,7 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { //ixgmq[v]->perfInst.Clear(); //ixgmq[v]->perfLLC_miss.Clear(); ixgmq[v]->powerMeter.Clear(); - + //memset(ixgbe_logs[v], 0, sizeof(IXGBE_LOG_SIZE * sizeof(union IxgbeLogEntry))); for (uint32_t i = 0; i < ixgbe_stats[v].itr_cnt; i++) { ixgbe_logs[v][i].Fields.rx_desc=0; @@ -2675,17 +2674,34 @@ void ebbrt::IxgbeDriver::Config(std::string s, uint32_t v) { ixgbe_logs[v][i].Fields.c7=0; ixgbe_logs[v][i].Fields.joules=0; ixgbe_logs[v][i].Fields.tsc=0; - } + } ixgbe_stats[v].itr_joules_last_tsc = 0; ixgbe_stats[v].itr_cnt =0; - ixgbe_stats[v].itr_cnt2 =0; + //ixgbe_stats[v].itr_cnt2 =0; ixgbe_stats[v].rdtsc_start = 0; ixgbe_stats[v].rdtsc_end = 0; ixgbe_stats[v].repeat =0; ixgbe_stats[v].dvfs =0; ixgbe_stats[v].rapl =0; ixgbe_stats[v].itr =0; - ixgbe_stats[v].iter =0; + ixgbe_stats[v].iter =0; + + // clear up counters + memset(nsleep_states, 0, sizeof(nsleep_states)); + /*memset(processCnt, 0, sizeof(processCnt)); + memset(swEventCnt, 
0, sizeof(swEventCnt)); + memset(idleEventCnt, 0, sizeof(idleEventCnt)); + memset(rxPollCnt, 0, sizeof(rxPollCnt)); + memset(processInterruptCntAll, 0, sizeof(processInterruptCntAll)); + memset(processInterruptCntA, 0, sizeof(processInterruptCntA)); + memset(processInterruptCntB, 0, sizeof(processInterruptCntB)); + memset(processInterruptCntC, 0, sizeof(processInterruptCntC)); + memset(passTokenCnt, 0, sizeof(passTokenCnt)); + memset(receiveTokenCnt, 0, sizeof(receiveTokenCnt)); + memset(genFireCnt, 0, sizeof(genFireCnt)); + memset(fireCntA, 0, sizeof(fireCntA)); + memset(fireCntB, 0, sizeof(fireCntB));*/ + } else { ebbrt::kprintf_force("%s Unknown command: %s\n", __FUNCTION__, s.c_str()); } diff --git a/src/native/IxgbeDriver.h b/src/native/IxgbeDriver.h index 3bdfc061..6cdbfc5f 100644 --- a/src/native/IxgbeDriver.h +++ b/src/native/IxgbeDriver.h @@ -74,6 +74,7 @@ struct IxgbeLog { extern struct IxgbeLog ixgbe_stats[16]; extern union IxgbeLogEntry *ixgbe_logs[16]; extern std::unique_ptr bsendbufs[16]; +//extern uint64_t rxPollCnt[16]; namespace ebbrt { diff --git a/src/native/Main.cc b/src/native/Main.cc index 382eaad6..dfed9d9a 100644 --- a/src/native/Main.cc +++ b/src/native/Main.cc @@ -70,6 +70,40 @@ extern "C" __attribute__((noreturn)) void ebbrt::Main(multiboot::Information* mbi) { console::Init(); + memset(nsleep_states, 0, sizeof(nsleep_states)); + memset(sleep_state, 0, sizeof(sleep_state)); + sleep_state[0]=48; + sleep_state[1]=48; + sleep_state[2]=48; + sleep_state[3]=48; + sleep_state[4]=48; + sleep_state[5]=48; + sleep_state[6]=48; + sleep_state[7]=48; + sleep_state[8]=48; + sleep_state[9]=48; + sleep_state[10]=48; + sleep_state[11]=48; + sleep_state[12]=48; + sleep_state[13]=48; + sleep_state[14]=48; + sleep_state[15]=48; + + /*memset(processCnt, 0, sizeof(processCnt)); + memset(swEventCnt, 0, sizeof(swEventCnt)); + memset(idleEventCnt, 0, sizeof(idleEventCnt)); + memset(rxPollCnt, 0, sizeof(rxPollCnt)); + memset(processInterruptCntAll, 0, 
sizeof(processInterruptCntAll)); + memset(processInterruptCntA, 0, sizeof(processInterruptCntA)); + memset(processInterruptCntB, 0, sizeof(processInterruptCntB)); + memset(processInterruptCntC, 0, sizeof(processInterruptCntC)); + memset(passTokenCnt, 0, sizeof(passTokenCnt)); + memset(receiveTokenCnt, 0, sizeof(receiveTokenCnt)); + memset(genFireCnt, 0, sizeof(genFireCnt)); + memset(timerCnt, 0, sizeof(timerCnt)); + memset(fireCntA, 0, sizeof(fireCntA)); + memset(fireCntB, 0, sizeof(fireCntB));*/ + #ifdef __EBBRT_ENABLE_TRACE__ trace::Init(); #endif diff --git a/src/native/Timer.cc b/src/native/Timer.cc index 6069be7d..a019f49a 100644 --- a/src/native/Timer.cc +++ b/src/native/Timer.cc @@ -10,9 +10,19 @@ const constexpr ebbrt::EbbId ebbrt::Timer::static_id; +//uint32_t timerCnt[16]; +//uint32_t fireCntA[16]; +//uint32_t fireCntB[16]; + ebbrt::Timer::Timer() { + //uint32_t mycpu = static_cast(Cpu::GetMine()); + + //timerCnt[mycpu] += 1; +// auto interrupt = event_manager->AllocateVector([this, mycpu]() { auto interrupt = event_manager->AllocateVector([this]() { auto now = clock::Wall::Now().time_since_epoch(); + //fireCntA[mycpu] ++; + while (!timers_.empty() && timers_.begin()->fire_time_ <= now) { auto& hook = *timers_.begin(); @@ -25,6 +35,7 @@ ebbrt::Timer::Timer() { timers_.insert(hook); } + //fireCntB[mycpu] ++; hook.Fire(); now = clock::Wall::Now().time_since_epoch();